282 282 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 | // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "callthunks: " fmt #include <linux/debugfs.h> #include <linux/kallsyms.h> #include <linux/memory.h> #include <linux/moduleloader.h> #include <linux/static_call.h> #include <asm/alternative.h> #include <asm/asm-offsets.h> #include <asm/cpu.h> #include <asm/ftrace.h> #include <asm/insn.h> #include <asm/kexec.h> #include <asm/nospec-branch.h> #include <asm/paravirt.h> #include <asm/sections.h> #include <asm/switch_to.h> #include <asm/sync_core.h> #include <asm/text-patching.h> #include <asm/xen/hypercall.h> static int __initdata_or_module debug_callthunks; #define MAX_PATCH_LEN (255-1) #define prdbg(fmt, args...) \ do { \ if (debug_callthunks) \ printk(KERN_DEBUG pr_fmt(fmt), ##args); \ } while(0) static int __init debug_thunks(char *str) { debug_callthunks = 1; return 1; } __setup("debug-callthunks", debug_thunks); #ifdef CONFIG_CALL_THUNKS_DEBUG DEFINE_PER_CPU(u64, __x86_call_count); DEFINE_PER_CPU(u64, __x86_ret_count); DEFINE_PER_CPU(u64, __x86_stuffs_count); DEFINE_PER_CPU(u64, __x86_ctxsw_count); EXPORT_PER_CPU_SYMBOL_GPL(__x86_ctxsw_count); EXPORT_PER_CPU_SYMBOL_GPL(__x86_call_count); #endif extern s32 __call_sites[], __call_sites_end[]; struct core_text { unsigned long base; unsigned long end; const char *name; }; static bool thunks_initialized __ro_after_init; static const struct core_text builtin_coretext = { .base = (unsigned long)_text, .end = (unsigned long)_etext, .name = "builtin", }; asm ( ".pushsection .rodata \n" ".global skl_call_thunk_template \n" "skl_call_thunk_template: \n" __stringify(INCREMENT_CALL_DEPTH)" \n" ".global skl_call_thunk_tail \n" "skl_call_thunk_tail: \n" ".popsection \n" ); extern u8 skl_call_thunk_template[]; extern u8 skl_call_thunk_tail[]; #define SKL_TMPL_SIZE \ ((unsigned int)(skl_call_thunk_tail - skl_call_thunk_template)) extern void error_entry(void); extern void xen_error_entry(void); extern void paranoid_entry(void); static inline bool within_coretext(const struct core_text *ct, void *addr) { unsigned long p = (unsigned long)addr; return ct->base <= p && p < ct->end; } static inline bool within_module_coretext(void *addr) { bool ret = false; #ifdef CONFIG_MODULES struct module *mod; preempt_disable(); mod = __module_address((unsigned long)addr); if (mod && within_module_core((unsigned long)addr, mod)) ret = true; preempt_enable(); #endif return ret; } static bool is_coretext(const struct core_text *ct, void *addr) { if (ct && within_coretext(ct, addr)) return true; if (within_coretext(&builtin_coretext, addr)) return true; return within_module_coretext(addr); } static bool skip_addr(void *dest) { if (dest == error_entry) return true; if (dest == paranoid_entry) return true; if (dest == xen_error_entry) return true; /* Does FILL_RSB... */ if (dest == __switch_to_asm) return true; /* Accounts directly */ if (dest == ret_from_fork) return true; #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT) if (dest == soft_restart_cpu) return true; #endif #ifdef CONFIG_FUNCTION_TRACER if (dest == __fentry__) return true; #endif #ifdef CONFIG_KEXEC_CORE if (dest >= (void *)relocate_kernel && dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE) return true; #endif #ifdef CONFIG_XEN if (dest >= (void *)hypercall_page && dest < (void*)hypercall_page + PAGE_SIZE) return true; #endif return false; } static __init_or_module void *call_get_dest(void *addr) { struct insn insn; void *dest; int ret; ret = insn_decode_kernel(&insn, addr); if (ret) return ERR_PTR(ret); /* Patched out call? */ if (insn.opcode.bytes[0] != CALL_INSN_OPCODE) return NULL; dest = addr + insn.length + insn.immediate.value; if (skip_addr(dest)) return NULL; return dest; } static const u8 nops[] = { 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, }; static void *patch_dest(void *dest, bool direct) { unsigned int tsize = SKL_TMPL_SIZE; u8 insn_buff[MAX_PATCH_LEN]; u8 *pad = dest - tsize; memcpy(insn_buff, skl_call_thunk_template, tsize); apply_relocation(insn_buff, tsize, pad, skl_call_thunk_template, tsize); /* Already patched? */ if (!bcmp(pad, insn_buff, tsize)) return pad; /* Ensure there are nops */ if (bcmp(pad, nops, tsize)) { pr_warn_once("Invalid padding area for %pS\n", dest); return NULL; } if (direct) memcpy(pad, insn_buff, tsize); else text_poke_copy_locked(pad, insn_buff, tsize, true); return pad; } static __init_or_module void patch_call(void *addr, const struct core_text *ct) { void *pad, *dest; u8 bytes[8]; if (!within_coretext(ct, addr)) return; dest = call_get_dest(addr); if (!dest || WARN_ON_ONCE(IS_ERR(dest))) return; if (!is_coretext(ct, dest)) return; pad = patch_dest(dest, within_coretext(ct, dest)); if (!pad) return; prdbg("Patch call at: %pS %px to %pS %px -> %px \n", addr, addr, dest, dest, pad); __text_gen_insn(bytes, CALL_INSN_OPCODE, addr, pad, CALL_INSN_SIZE); text_poke_early(addr, bytes, CALL_INSN_SIZE); } static __init_or_module void patch_call_sites(s32 *start, s32 *end, const struct core_text *ct) { s32 *s; for (s = start; s < end; s++) patch_call((void *)s + *s, ct); } static __init_or_module void patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end, const struct core_text *ct) { struct alt_instr *a; for (a = start; a < end; a++) patch_call((void *)&a->instr_offset + a->instr_offset, ct); } static __init_or_module void callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct) { prdbg("Patching call sites %s\n", ct->name); patch_call_sites(cs->call_start, cs->call_end, ct); patch_alt_call_sites(cs->alt_start, cs->alt_end, ct); prdbg("Patching call sites done%s\n", ct->name); } void __init callthunks_patch_builtin_calls(void) { struct callthunk_sites cs = { .call_start = __call_sites, .call_end = __call_sites_end, .alt_start = __alt_instructions, .alt_end = __alt_instructions_end }; if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) return; pr_info("Setting up call depth tracking\n"); mutex_lock(&text_mutex); callthunks_setup(&cs, &builtin_coretext); thunks_initialized = true; mutex_unlock(&text_mutex); } void *callthunks_translate_call_dest(void *dest) { void *target; lockdep_assert_held(&text_mutex); if (!thunks_initialized || skip_addr(dest)) return dest; if (!is_coretext(NULL, dest)) return dest; target = patch_dest(dest, false); return target ? : dest; } #ifdef CONFIG_BPF_JIT static bool is_callthunk(void *addr) { unsigned int tmpl_size = SKL_TMPL_SIZE; u8 insn_buff[MAX_PATCH_LEN]; unsigned long dest; u8 *pad; dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT); if (!thunks_initialized || skip_addr((void *)dest)) return false; pad = (void *)(dest - tmpl_size); memcpy(insn_buff, skl_call_thunk_template, tmpl_size); apply_relocation(insn_buff, tmpl_size, pad, skl_call_thunk_template, tmpl_size); return !bcmp(pad, insn_buff, tmpl_size); } int x86_call_depth_emit_accounting(u8 **pprog, void *func) { unsigned int tmpl_size = SKL_TMPL_SIZE; u8 insn_buff[MAX_PATCH_LEN]; if (!thunks_initialized) return 0; /* Is function call target a thunk? */ if (func && is_callthunk(func)) return 0; memcpy(insn_buff, skl_call_thunk_template, tmpl_size); apply_relocation(insn_buff, tmpl_size, *pprog, skl_call_thunk_template, tmpl_size); memcpy(*pprog, insn_buff, tmpl_size); *pprog += tmpl_size; return tmpl_size; } #endif #ifdef CONFIG_MODULES void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, struct module *mod) { struct core_text ct = { .base = (unsigned long)mod->mem[MOD_TEXT].base, .end = (unsigned long)mod->mem[MOD_TEXT].base + mod->mem[MOD_TEXT].size, .name = mod->name, }; if (!thunks_initialized) return; mutex_lock(&text_mutex); callthunks_setup(cs, &ct); mutex_unlock(&text_mutex); } #endif /* CONFIG_MODULES */ #if defined(CONFIG_CALL_THUNKS_DEBUG) && defined(CONFIG_DEBUG_FS) static int callthunks_debug_show(struct seq_file *m, void *p) { unsigned long cpu = (unsigned long)m->private; seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n,", per_cpu(__x86_call_count, cpu), per_cpu(__x86_ret_count, cpu), per_cpu(__x86_stuffs_count, cpu), per_cpu(__x86_ctxsw_count, cpu)); return 0; } static int callthunks_debug_open(struct inode *inode, struct file *file) { return single_open(file, callthunks_debug_show, inode->i_private); } static const struct file_operations dfs_ops = { .open = callthunks_debug_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static int __init callthunks_debugfs_init(void) { struct dentry *dir; unsigned long cpu; dir = debugfs_create_dir("callthunks", NULL); for_each_possible_cpu(cpu) { void *arg = (void *)cpu; char name [10]; sprintf(name, "cpu%lu", cpu); debugfs_create_file(name, 0644, dir, arg, &dfs_ops); } return 0; } __initcall(callthunks_debugfs_init); #endif |
1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Pixart PAC7302 driver * * Copyright (C) 2008-2012 Jean-Francois Moine <http://moinejf.free.fr> * Copyright (C) 2005 Thomas Kaiser thomas@kaiser-linux.li * * Separated from Pixart PAC7311 library by Márton Németh * Camera button input handling by Márton Németh <nm127@freemail.hu> * Copyright (C) 2009-2010 Márton Németh <nm127@freemail.hu> */ /* * Some documentation about various registers as determined by trial and error. * * Register page 0: * * Address Description * 0x01 Red balance control * 0x02 Green balance control * 0x03 Blue balance control * The Windows driver uses a quadratic approach to map * the settable values (0-200) on register values: * min=0x20, default=0x40, max=0x80 * 0x0f-0x20 Color and saturation control * 0xa2-0xab Brightness, contrast and gamma control * 0xb6 Sharpness control (bits 0-4) * * Register page 1: * * Address Description * 0x78 Global control, bit 6 controls the LED (inverted) * 0x80 Compression balance, 2 interesting settings: * 0x0f Default * 0x50 Values >= this switch the camera to a lower compression, * using the same table for both luminance and chrominance. * This gives a sharper picture. Only usable when running * at < 15 fps! Note currently the driver does not use this * as the quality gain is small and the generated JPG-s are * only understood by v4l-utils >= 0.8.9 * * Register page 3: * * Address Description * 0x02 Clock divider 3-63, fps = 90 / val. Must be a multiple of 3 on * the 7302, so one of 3, 6, 9, ..., except when between 6 and 12? * 0x03 Variable framerate ctrl reg2==3: 0 -> ~30 fps, 255 -> ~22fps * 0x04 Another var framerate ctrl reg2==3, reg3==0: 0 -> ~30 fps, * 63 -> ~27 fps, the 2 msb's must always be 1 !! * 0x05 Another var framerate ctrl reg2==3, reg3==0, reg4==0xc0: * 1 -> ~30 fps, 2 -> ~20 fps * 0x0e Exposure bits 0-7, 0-448, 0 = use full frame time * 0x0f Exposure bit 8, 0-448, 448 = no exposure at all * 0x10 Gain 0-31 * 0x12 Another gain 0-31, unlike 0x10 this one seems to start with an * amplification value of 1 rather then 0 at its lowest setting * 0x21 Bitfield: 0-1 unused, 2-3 vflip/hflip, 4-5 unknown, 6-7 unused * 0x80 Another framerate control, best left at 1, moving it from 1 to * 2 causes the framerate to become 3/4th of what it was, and * also seems to cause pixel averaging, resulting in an effective * resolution of 320x240 and thus a much blockier image * * The registers are accessed in the following functions: * * Page | Register | Function * -----+------------+--------------------------------------------------- * 0 | 0x01 | setredbalance() * 0 | 0x03 | setbluebalance() * 0 | 0x0f..0x20 | setcolors() * 0 | 0xa2..0xab | setbrightcont() * 0 | 0xb6 | setsharpness() * 0 | 0xc6 | setwhitebalance() * 0 | 0xdc | setbrightcont(), setcolors() * 3 | 0x02 | setexposure() * 3 | 0x10, 0x12 | setgain() * 3 | 0x11 | setcolors(), setgain(), setexposure(), sethvflip() * 3 | 0x21 | sethvflip() */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/input.h> #include "gspca.h" /* Include pac common sof detection functions */ #include "pac_common.h" #define PAC7302_RGB_BALANCE_MIN 0 #define PAC7302_RGB_BALANCE_MAX 200 #define PAC7302_RGB_BALANCE_DEFAULT 100 #define PAC7302_GAIN_DEFAULT 15 #define PAC7302_GAIN_KNEE 42 #define PAC7302_EXPOSURE_DEFAULT 66 /* 33 ms / 30 fps */ #define PAC7302_EXPOSURE_KNEE 133 /* 66 ms / 15 fps */ MODULE_AUTHOR("Jean-Francois Moine <http://moinejf.free.fr>, Thomas Kaiser thomas@kaiser-linux.li"); MODULE_DESCRIPTION("Pixart PAC7302"); MODULE_LICENSE("GPL"); struct sd { struct gspca_dev gspca_dev; /* !! must be the first item */ struct { /* brightness / contrast cluster */ struct v4l2_ctrl *brightness; struct v4l2_ctrl *contrast; }; struct v4l2_ctrl *saturation; struct v4l2_ctrl *white_balance; struct v4l2_ctrl *red_balance; struct v4l2_ctrl *blue_balance; struct { /* flip cluster */ struct v4l2_ctrl *hflip; struct v4l2_ctrl *vflip; }; struct v4l2_ctrl *sharpness; u8 flags; #define FL_HFLIP 0x01 /* mirrored by default */ #define FL_VFLIP 0x02 /* vertical flipped by default */ u8 sof_read; s8 autogain_ignore_frames; atomic_t avg_lum; }; static const struct v4l2_pix_format vga_mode[] = { {640, 480, V4L2_PIX_FMT_PJPG, V4L2_FIELD_NONE, .bytesperline = 640, .sizeimage = 640 * 480 * 3 / 8 + 590, .colorspace = V4L2_COLORSPACE_JPEG, }, }; #define LOAD_PAGE3 255 #define END_OF_SEQUENCE 0 static const u8 init_7302[] = { /* index,value */ 0xff, 0x01, /* page 1 */ 0x78, 0x00, /* deactivate */ 0xff, 0x01, 0x78, 0x40, /* led off */ }; static const u8 start_7302[] = { /* index, len, [value]* */ 0xff, 1, 0x00, /* page 0 */ 0x00, 12, 0x01, 0x40, 0x40, 0x40, 0x01, 0xe0, 0x02, 0x80, 0x00, 0x00, 0x00, 0x00, 0x0d, 24, 0x03, 0x01, 0x00, 0xb5, 0x07, 0xcb, 0x00, 0x00, 0x07, 0xc8, 0x00, 0xea, 0x07, 0xcf, 0x07, 0xf7, 0x07, 0x7e, 0x01, 0x0b, 0x00, 0x00, 0x00, 0x11, 0x26, 2, 0xaa, 0xaa, 0x2e, 1, 0x31, 0x38, 1, 0x01, 0x3a, 3, 0x14, 0xff, 0x5a, 0x43, 11, 0x00, 0x0a, 0x18, 0x11, 0x01, 0x2c, 0x88, 0x11, 0x00, 0x54, 0x11, 0x55, 1, 0x00, 0x62, 4, 0x10, 0x1e, 0x1e, 0x18, 0x6b, 1, 0x00, 0x6e, 3, 0x08, 0x06, 0x00, 0x72, 3, 0x00, 0xff, 0x00, 0x7d, 23, 0x01, 0x01, 0x58, 0x46, 0x50, 0x3c, 0x50, 0x3c, 0x54, 0x46, 0x54, 0x56, 0x52, 0x50, 0x52, 0x50, 0x56, 0x64, 0xa4, 0x00, 0xda, 0x00, 0x00, 0xa2, 10, 0x22, 0x2c, 0x3c, 0x54, 0x69, 0x7c, 0x9c, 0xb9, 0xd2, 0xeb, 0xaf, 1, 0x02, 0xb5, 2, 0x08, 0x08, 0xb8, 2, 0x08, 0x88, 0xc4, 4, 0xae, 0x01, 0x04, 0x01, 0xcc, 1, 0x00, 0xd1, 11, 0x01, 0x30, 0x49, 0x5e, 0x6f, 0x7f, 0x8e, 0xa9, 0xc1, 0xd7, 0xec, 0xdc, 1, 0x01, 0xff, 1, 0x01, /* page 1 */ 0x12, 3, 0x02, 0x00, 0x01, 0x3e, 2, 0x00, 0x00, 0x76, 5, 0x01, 0x20, 0x40, 0x00, 0xf2, 0x7c, 1, 0x00, 0x7f, 10, 0x4b, 0x0f, 0x01, 0x2c, 0x02, 0x58, 0x03, 0x20, 0x02, 0x00, 0x96, 5, 0x01, 0x10, 0x04, 0x01, 0x04, 0xc8, 14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x07, 0x00, 0x01, 0x07, 0x04, 0x01, 0xd8, 1, 0x01, 0xdb, 2, 0x00, 0x01, 0xde, 7, 0x00, 0x01, 0x04, 0x04, 0x00, 0x00, 0x00, 0xe6, 4, 0x00, 0x00, 0x00, 0x01, 0xeb, 1, 0x00, 0xff, 1, 0x02, /* page 2 */ 0x22, 1, 0x00, 0xff, 1, 0x03, /* page 3 */ 0, LOAD_PAGE3, /* load the page 3 */ 0x11, 1, 0x01, 0xff, 1, 0x02, /* page 2 */ 0x13, 1, 0x00, 0x22, 4, 0x1f, 0xa4, 0xf0, 0x96, 0x27, 2, 0x14, 0x0c, 0x2a, 5, 0xc8, 0x00, 0x18, 0x12, 0x22, 0x64, 8, 0x00, 0x00, 0xf0, 0x01, 0x14, 0x44, 0x44, 0x44, 0x6e, 1, 0x08, 0xff, 1, 0x01, /* page 1 */ 0x78, 1, 0x00, 0, END_OF_SEQUENCE /* end of sequence */ }; #define SKIP 0xaa /* page 3 - the value SKIP says skip the index - see reg_w_page() */ static const u8 page3_7302[] = { 0x90, 0x40, 0x03, 0x00, 0xc0, 0x01, 0x14, 0x16, 0x14, 0x12, 0x00, 0x00, 0x00, 0x02, 0x33, 0x00, 0x0f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x01, 0xb3, 0x01, 0x00, 0x00, 0x08, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x54, 0xf4, 0x02, 0x52, 0x54, 0xa4, 0xb8, 0xe0, 0x2a, 0xf6, 0x00, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0xf2, 0x1f, 0x04, 0x00, 0x00, SKIP, 0x00, 0x00, 0xc0, 0xc0, 0x10, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0xff, 0x03, 0x19, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc8, 0xc8, 0xc8, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x08, 0x10, 0x24, 0x40, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x02, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xfa, 0x00, 0x64, 0x5a, 0x28, 0x00, 0x00 }; static void reg_w_buf(struct gspca_dev *gspca_dev, u8 index, const u8 *buffer, int len) { int ret; if (gspca_dev->usb_err < 0) return; memcpy(gspca_dev->usb_buf, buffer, len); ret = usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0, /* request */ USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, /* value */ index, gspca_dev->usb_buf, len, 500); if (ret < 0) { pr_err("reg_w_buf failed i: %02x error %d\n", index, ret); gspca_dev->usb_err = ret; } } static void reg_w(struct gspca_dev *gspca_dev, u8 index, u8 value) { int ret; if (gspca_dev->usb_err < 0) return; gspca_dev->usb_buf[0] = value; ret = usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0, /* request */ USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, index, gspca_dev->usb_buf, 1, 500); if (ret < 0) { pr_err("reg_w() failed i: %02x v: %02x error %d\n", index, value, ret); gspca_dev->usb_err = ret; } } static void reg_w_seq(struct gspca_dev *gspca_dev, const u8 *seq, int len) { while (--len >= 0) { reg_w(gspca_dev, seq[0], seq[1]); seq += 2; } } /* load the beginning of a page */ static void reg_w_page(struct gspca_dev *gspca_dev, const u8 *page, int len) { int index; int ret = 0; if (gspca_dev->usb_err < 0) return; for (index = 0; index < len; index++) { if (page[index] == SKIP) /* skip this index */ continue; gspca_dev->usb_buf[0] = page[index]; ret = usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0, /* request */ USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, index, gspca_dev->usb_buf, 1, 500); if (ret < 0) { pr_err("reg_w_page() failed i: %02x v: %02x error %d\n", index, page[index], ret); gspca_dev->usb_err = ret; break; } } } /* output a variable sequence */ static void reg_w_var(struct gspca_dev *gspca_dev, const u8 *seq, const u8 *page3, unsigned int page3_len) { int index, len; for (;;) { index = *seq++; len = *seq++; switch (len) { case END_OF_SEQUENCE: return; case LOAD_PAGE3: reg_w_page(gspca_dev, page3, page3_len); break; default: if (len > USB_BUF_SZ) { gspca_err(gspca_dev, "Incorrect variable sequence\n"); return; } while (len > 0) { if (len < 8) { reg_w_buf(gspca_dev, index, seq, len); seq += len; break; } reg_w_buf(gspca_dev, index, seq, 8); seq += 8; index += 8; len -= 8; } } } /* not reached */ } /* this function is called at probe time for pac7302 */ static int sd_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id) { struct sd *sd = (struct sd *) gspca_dev; struct cam *cam; cam = &gspca_dev->cam; cam->cam_mode = vga_mode; /* only 640x480 */ cam->nmodes = ARRAY_SIZE(vga_mode); sd->flags = id->driver_info; return 0; } static void setbrightcont(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; int i, v; static const u8 max[10] = {0x29, 0x33, 0x42, 0x5a, 0x6e, 0x80, 0x9f, 0xbb, 0xd4, 0xec}; static const u8 delta[10] = {0x35, 0x33, 0x33, 0x2f, 0x2a, 0x25, 0x1e, 0x17, 0x11, 0x0b}; reg_w(gspca_dev, 0xff, 0x00); /* page 0 */ for (i = 0; i < 10; i++) { v = max[i]; v += (sd->brightness->val - (s32)sd->brightness->maximum) * 150 / (s32)sd->brightness->maximum; /* 200 ? */ v -= delta[i] * sd->contrast->val / (s32)sd->contrast->maximum; if (v < 0) v = 0; else if (v > 0xff) v = 0xff; reg_w(gspca_dev, 0xa2 + i, v); } reg_w(gspca_dev, 0xdc, 0x01); } static void setcolors(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; int i, v; static const int a[9] = {217, -212, 0, -101, 170, -67, -38, -315, 355}; static const int b[9] = {19, 106, 0, 19, 106, 1, 19, 106, 1}; reg_w(gspca_dev, 0xff, 0x03); /* page 3 */ reg_w(gspca_dev, 0x11, 0x01); reg_w(gspca_dev, 0xff, 0x00); /* page 0 */ for (i = 0; i < 9; i++) { v = a[i] * sd->saturation->val / (s32)sd->saturation->maximum; v += b[i]; reg_w(gspca_dev, 0x0f + 2 * i, (v >> 8) & 0x07); reg_w(gspca_dev, 0x0f + 2 * i + 1, v); } reg_w(gspca_dev, 0xdc, 0x01); } static void setwhitebalance(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; reg_w(gspca_dev, 0xff, 0x00); /* page 0 */ reg_w(gspca_dev, 0xc6, sd->white_balance->val); reg_w(gspca_dev, 0xdc, 0x01); } static u8 rgbbalance_ctrl_to_reg_value(s32 rgb_ctrl_val) { const unsigned int k = 1000; /* precision factor */ unsigned int norm; /* Normed value [0...k] */ norm = k * (rgb_ctrl_val - PAC7302_RGB_BALANCE_MIN) / (PAC7302_RGB_BALANCE_MAX - PAC7302_RGB_BALANCE_MIN); /* Qudratic apporach improves control at small (register) values: */ return 64 * norm * norm / (k*k) + 32 * norm / k + 32; /* Y = 64*X*X + 32*X + 32 * => register values 0x20-0x80; Windows driver uses these limits */ /* NOTE: for full value range (0x00-0xff) use * Y = 254*X*X + X * => 254 * norm * norm / (k*k) + 1 * norm / k */ } static void setredbalance(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; reg_w(gspca_dev, 0xff, 0x00); /* page 0 */ reg_w(gspca_dev, 0x01, rgbbalance_ctrl_to_reg_value(sd->red_balance->val)); reg_w(gspca_dev, 0xdc, 0x01); } static void setbluebalance(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; reg_w(gspca_dev, 0xff, 0x00); /* page 0 */ reg_w(gspca_dev, 0x03, rgbbalance_ctrl_to_reg_value(sd->blue_balance->val)); reg_w(gspca_dev, 0xdc, 0x01); } static void setgain(struct gspca_dev *gspca_dev) { u8 reg10, reg12; if (gspca_dev->gain->val < 32) { reg10 = gspca_dev->gain->val; reg12 = 0; } else { reg10 = 31; reg12 = gspca_dev->gain->val - 31; } reg_w(gspca_dev, 0xff, 0x03); /* page 3 */ reg_w(gspca_dev, 0x10, reg10); reg_w(gspca_dev, 0x12, reg12); /* load registers to sensor (Bit 0, auto clear) */ reg_w(gspca_dev, 0x11, 0x01); } static void setexposure(struct gspca_dev *gspca_dev) { u8 clockdiv; u16 exposure; /* * Register 2 of frame 3 contains the clock divider configuring the * no fps according to the formula: 90 / reg. sd->exposure is the * desired exposure time in 0.5 ms. */ clockdiv = (90 * gspca_dev->exposure->val + 1999) / 2000; /* * Note clockdiv = 3 also works, but when running at 30 fps, depending * on the scene being recorded, the camera switches to another * quantization table for certain JPEG blocks, and we don't know how * to decompress these blocks. So we cap the framerate at 15 fps. */ if (clockdiv < 6) clockdiv = 6; else if (clockdiv > 63) clockdiv = 63; /* * Register 2 MUST be a multiple of 3, except when between 6 and 12? * Always round up, otherwise we cannot get the desired frametime * using the partial frame time exposure control. */ if (clockdiv < 6 || clockdiv > 12) clockdiv = ((clockdiv + 2) / 3) * 3; /* * frame exposure time in ms = 1000 * clockdiv / 90 -> * exposure = (sd->exposure / 2) * 448 / (1000 * clockdiv / 90) */ exposure = (gspca_dev->exposure->val * 45 * 448) / (1000 * clockdiv); /* 0 = use full frametime, 448 = no exposure, reverse it */ exposure = 448 - exposure; reg_w(gspca_dev, 0xff, 0x03); /* page 3 */ reg_w(gspca_dev, 0x02, clockdiv); reg_w(gspca_dev, 0x0e, exposure & 0xff); reg_w(gspca_dev, 0x0f, exposure >> 8); /* load registers to sensor (Bit 0, auto clear) */ reg_w(gspca_dev, 0x11, 0x01); } static void sethvflip(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; u8 data, hflip, vflip; hflip = sd->hflip->val; if (sd->flags & FL_HFLIP) hflip = !hflip; vflip = sd->vflip->val; if (sd->flags & FL_VFLIP) vflip = !vflip; reg_w(gspca_dev, 0xff, 0x03); /* page 3 */ data = (hflip ? 0x08 : 0x00) | (vflip ? 0x04 : 0x00); reg_w(gspca_dev, 0x21, data); /* load registers to sensor (Bit 0, auto clear) */ reg_w(gspca_dev, 0x11, 0x01); } static void setsharpness(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; reg_w(gspca_dev, 0xff, 0x00); /* page 0 */ reg_w(gspca_dev, 0xb6, sd->sharpness->val); reg_w(gspca_dev, 0xdc, 0x01); } /* this function is called at probe and resume time for pac7302 */ static int sd_init(struct gspca_dev *gspca_dev) { reg_w_seq(gspca_dev, init_7302, sizeof(init_7302)/2); return gspca_dev->usb_err; } static int sd_s_ctrl(struct v4l2_ctrl *ctrl) { struct gspca_dev *gspca_dev = container_of(ctrl->handler, struct gspca_dev, ctrl_handler); struct sd *sd = (struct sd *)gspca_dev; gspca_dev->usb_err = 0; if (ctrl->id == V4L2_CID_AUTOGAIN && ctrl->is_new && ctrl->val) { /* when switching to autogain set defaults to make sure we are on a valid point of the autogain gain / exposure knee graph, and give this change time to take effect before doing autogain. */ gspca_dev->exposure->val = PAC7302_EXPOSURE_DEFAULT; gspca_dev->gain->val = PAC7302_GAIN_DEFAULT; sd->autogain_ignore_frames = PAC_AUTOGAIN_IGNORE_FRAMES; } if (!gspca_dev->streaming) return 0; switch (ctrl->id) { case V4L2_CID_BRIGHTNESS: setbrightcont(gspca_dev); break; case V4L2_CID_SATURATION: setcolors(gspca_dev); break; case V4L2_CID_WHITE_BALANCE_TEMPERATURE: setwhitebalance(gspca_dev); break; case V4L2_CID_RED_BALANCE: setredbalance(gspca_dev); break; case V4L2_CID_BLUE_BALANCE: setbluebalance(gspca_dev); break; case V4L2_CID_AUTOGAIN: if (gspca_dev->exposure->is_new || (ctrl->is_new && ctrl->val)) setexposure(gspca_dev); if (gspca_dev->gain->is_new || (ctrl->is_new && ctrl->val)) setgain(gspca_dev); break; case V4L2_CID_HFLIP: sethvflip(gspca_dev); break; case V4L2_CID_SHARPNESS: setsharpness(gspca_dev); break; default: return -EINVAL; } return gspca_dev->usb_err; } static const struct v4l2_ctrl_ops sd_ctrl_ops = { .s_ctrl = sd_s_ctrl, }; /* this function is called at probe time */ static int sd_init_controls(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; struct v4l2_ctrl_handler *hdl = &gspca_dev->ctrl_handler; gspca_dev->vdev.ctrl_handler = hdl; v4l2_ctrl_handler_init(hdl, 12); sd->brightness = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_BRIGHTNESS, 0, 32, 1, 16); sd->contrast = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_CONTRAST, 0, 255, 1, 127); sd->saturation = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_SATURATION, 0, 255, 1, 127); sd->white_balance = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_WHITE_BALANCE_TEMPERATURE, 0, 255, 1, 55); sd->red_balance = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_RED_BALANCE, PAC7302_RGB_BALANCE_MIN, PAC7302_RGB_BALANCE_MAX, 1, PAC7302_RGB_BALANCE_DEFAULT); sd->blue_balance = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_BLUE_BALANCE, PAC7302_RGB_BALANCE_MIN, PAC7302_RGB_BALANCE_MAX, 1, PAC7302_RGB_BALANCE_DEFAULT); gspca_dev->autogain = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_AUTOGAIN, 0, 1, 1, 1); gspca_dev->exposure = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_EXPOSURE, 0, 1023, 1, PAC7302_EXPOSURE_DEFAULT); gspca_dev->gain = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_GAIN, 0, 62, 1, PAC7302_GAIN_DEFAULT); sd->hflip = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_HFLIP, 0, 1, 1, 0); sd->vflip = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_VFLIP, 0, 1, 1, 0); sd->sharpness = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_SHARPNESS, 0, 15, 1, 8); if (hdl->error) { pr_err("Could not initialize controls\n"); return hdl->error; } v4l2_ctrl_cluster(2, &sd->brightness); v4l2_ctrl_auto_cluster(3, &gspca_dev->autogain, 0, false); v4l2_ctrl_cluster(2, &sd->hflip); return 0; } /* -- start the camera -- */ static int sd_start(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; reg_w_var(gspca_dev, start_7302, page3_7302, sizeof(page3_7302)); sd->sof_read = 0; sd->autogain_ignore_frames = 0; atomic_set(&sd->avg_lum, 270 + sd->brightness->val); /* start stream */ reg_w(gspca_dev, 0xff, 0x01); reg_w(gspca_dev, 0x78, 0x01); return gspca_dev->usb_err; } static void sd_stopN(struct gspca_dev *gspca_dev) { /* stop stream */ reg_w(gspca_dev, 0xff, 0x01); reg_w(gspca_dev, 0x78, 0x00); } /* called on streamoff with alt 0 and on disconnect for pac7302 */ static void sd_stop0(struct gspca_dev *gspca_dev) { if (!gspca_dev->present) return; reg_w(gspca_dev, 0xff, 0x01); reg_w(gspca_dev, 0x78, 0x40); } static void do_autogain(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; int avg_lum = atomic_read(&sd->avg_lum); int desired_lum; const int deadzone = 30; if (sd->autogain_ignore_frames < 0) return; if (sd->autogain_ignore_frames > 0) { sd->autogain_ignore_frames--; } else { desired_lum = 270 + sd->brightness->val; if (gspca_expo_autogain(gspca_dev, avg_lum, desired_lum, deadzone, PAC7302_GAIN_KNEE, PAC7302_EXPOSURE_KNEE)) sd->autogain_ignore_frames = PAC_AUTOGAIN_IGNORE_FRAMES; } } /* JPEG header */ static const u8 jpeg_header[] = { 0xff, 0xd8, /* SOI: Start of Image */ 0xff, 0xc0, /* SOF0: Start of Frame (Baseline DCT) */ 0x00, 0x11, /* length = 17 bytes (including this length field) */ 0x08, /* Precision: 8 */ 0x02, 0x80, /* height = 640 (image rotated) */ 0x01, 0xe0, /* width = 480 */ 0x03, /* Number of image components: 3 */ 0x01, 0x21, 0x00, /* ID=1, Subsampling 1x1, Quantization table: 0 */ 0x02, 0x11, 0x01, /* ID=2, Subsampling 2x1, Quantization table: 1 */ 0x03, 0x11, 0x01, /* ID=3, Subsampling 2x1, Quantization table: 1 */ 0xff, 0xda, /* SOS: Start Of Scan */ 0x00, 0x0c, /* length = 12 bytes (including this length field) */ 0x03, /* number of components: 3 */ 0x01, 0x00, /* selector 1, table 0x00 */ 0x02, 0x11, /* selector 2, table 0x11 */ 0x03, 0x11, /* selector 3, table 0x11 */ 0x00, 0x3f, /* Spectral selection: 0 .. 63 */ 0x00 /* Successive approximation: 0 */ }; /* this function is run at interrupt level */ static void sd_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* isoc packet */ int len) /* iso packet length */ { struct sd *sd = (struct sd *) gspca_dev; u8 *image; u8 *sof; sof = pac_find_sof(gspca_dev, &sd->sof_read, data, len); if (sof) { int n, lum_offset, footer_length; /* * 6 bytes after the FF D9 EOF marker a number of lumination * bytes are send corresponding to different parts of the * image, the 14th and 15th byte after the EOF seem to * correspond to the center of the image. */ lum_offset = 61 + sizeof pac_sof_marker; footer_length = 74; /* Finish decoding current frame */ n = (sof - data) - (footer_length + sizeof pac_sof_marker); if (n < 0) { gspca_dev->image_len += n; } else { gspca_frame_add(gspca_dev, INTER_PACKET, data, n); } image = gspca_dev->image; if (image != NULL && image[gspca_dev->image_len - 2] == 0xff && image[gspca_dev->image_len - 1] == 0xd9) gspca_frame_add(gspca_dev, LAST_PACKET, NULL, 0); n = sof - data; len -= n; data = sof; /* Get average lumination */ if (gspca_dev->last_packet_type == LAST_PACKET && n >= lum_offset) atomic_set(&sd->avg_lum, data[-lum_offset] + data[-lum_offset + 1]); /* Start the new frame with the jpeg header */ /* The PAC7302 has the image rotated 90 degrees */ gspca_frame_add(gspca_dev, FIRST_PACKET, jpeg_header, sizeof jpeg_header); } gspca_frame_add(gspca_dev, INTER_PACKET, data, len); } #ifdef CONFIG_VIDEO_ADV_DEBUG static int sd_dbg_s_register(struct gspca_dev *gspca_dev, const struct v4l2_dbg_register *reg) { u8 index; u8 value; /* * reg->reg: bit0..15: reserved for register index (wIndex is 16bit * long on the USB bus) */ if (reg->match.addr == 0 && (reg->reg < 0x000000ff) && (reg->val <= 0x000000ff) ) { /* Currently writing to page 0 is only supported. */ /* reg_w() only supports 8bit index */ index = reg->reg; value = reg->val; /* * Note that there shall be no access to other page * by any other function between the page switch and * the actual register write. */ reg_w(gspca_dev, 0xff, 0x00); /* page 0 */ reg_w(gspca_dev, index, value); reg_w(gspca_dev, 0xdc, 0x01); } return gspca_dev->usb_err; } #endif #if IS_ENABLED(CONFIG_INPUT) static int sd_int_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* interrupt packet data */ int len) /* interrupt packet length */ { int ret = -EINVAL; u8 data0, data1; if (len == 2) { data0 = data[0]; data1 = data[1]; if ((data0 == 0x00 && data1 == 0x11) || (data0 == 0x22 && data1 == 0x33) || (data0 == 0x44 && data1 == 0x55) || (data0 == 0x66 && data1 == 0x77) || (data0 == 0x88 && data1 == 0x99) || (data0 == 0xaa && data1 == 0xbb) || (data0 == 0xcc && data1 == 0xdd) || (data0 == 0xee && data1 == 0xff)) { input_report_key(gspca_dev->input_dev, KEY_CAMERA, 1); input_sync(gspca_dev->input_dev); input_report_key(gspca_dev->input_dev, KEY_CAMERA, 0); input_sync(gspca_dev->input_dev); ret = 0; } } return ret; } #endif /* sub-driver description for pac7302 */ static const struct sd_desc sd_desc = { .name = KBUILD_MODNAME, .config = sd_config, .init = sd_init, .init_controls = sd_init_controls, .start = sd_start, .stopN = sd_stopN, .stop0 = sd_stop0, .pkt_scan = sd_pkt_scan, .dq_callback = do_autogain, #ifdef CONFIG_VIDEO_ADV_DEBUG .set_register = sd_dbg_s_register, #endif #if IS_ENABLED(CONFIG_INPUT) .int_pkt_scan = sd_int_pkt_scan, #endif }; /* -- module initialisation -- */ static const struct usb_device_id device_table[] = { {USB_DEVICE(0x06f8, 0x3009)}, {USB_DEVICE(0x06f8, 0x301b)}, {USB_DEVICE(0x093a, 0x2620)}, {USB_DEVICE(0x093a, 0x2621)}, {USB_DEVICE(0x093a, 0x2622), .driver_info = FL_VFLIP}, {USB_DEVICE(0x093a, 0x2623), .driver_info = FL_VFLIP}, {USB_DEVICE(0x093a, 0x2624), .driver_info = FL_VFLIP}, {USB_DEVICE(0x093a, 0x2625)}, {USB_DEVICE(0x093a, 0x2626)}, {USB_DEVICE(0x093a, 0x2627), .driver_info = FL_VFLIP}, {USB_DEVICE(0x093a, 0x2628)}, {USB_DEVICE(0x093a, 0x2629), .driver_info = FL_VFLIP}, {USB_DEVICE(0x093a, 0x262a)}, {USB_DEVICE(0x093a, 0x262c)}, {USB_DEVICE(0x145f, 0x013c)}, {USB_DEVICE(0x1ae7, 0x2001)}, /* SpeedLink Snappy Mic SL-6825-SBK */ {} }; MODULE_DEVICE_TABLE(usb, device_table); /* -- device connect -- */ static int sd_probe(struct usb_interface *intf, const struct usb_device_id *id) { return gspca_dev_probe(intf, id, &sd_desc, sizeof(struct sd), THIS_MODULE); } static struct usb_driver sd_driver = { .name = KBUILD_MODNAME, .id_table = device_table, .probe = sd_probe, .disconnect = gspca_disconnect, #ifdef CONFIG_PM .suspend = gspca_suspend, .resume = gspca_resume, .reset_resume = gspca_resume, #endif }; module_usb_driver(sd_driver); |
45 2 43 27 27 39 43 45 37 11 3 44 37 7 1 6 20 2 8 36 1 34 2 2 36 36 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 | // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/skbuff.h> #include <linux/sctp.h> #include <net/gso.h> #include <net/gro.h> /** * skb_eth_gso_segment - segmentation handler for ethernet protocols. * @skb: buffer to segment * @features: features for the output path (see dev->features) * @type: Ethernet Protocol ID */ struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, netdev_features_t features, __be16 type) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; rcu_read_lock(); list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; } } rcu_read_unlock(); return segs; } EXPORT_SYMBOL(skb_eth_gso_segment); /** * skb_mac_gso_segment - mac layer segmentation handler. * @skb: buffer to segment * @features: features for the output path (see dev->features) */ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; int vlan_depth = skb->mac_len; __be16 type = skb_network_protocol(skb, &vlan_depth); if (unlikely(!type)) return ERR_PTR(-EINVAL); __skb_pull(skb, vlan_depth); rcu_read_lock(); list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; } } rcu_read_unlock(); __skb_push(skb, skb->data - skb_mac_header(skb)); return segs; } EXPORT_SYMBOL(skb_mac_gso_segment); /* openvswitch calls this on rx path, so we need a different check. */ static bool skb_needs_check(const struct sk_buff *skb, bool tx_path) { if (tx_path) return skb->ip_summed != CHECKSUM_PARTIAL && skb->ip_summed != CHECKSUM_UNNECESSARY; return skb->ip_summed == CHECKSUM_NONE; } /** * __skb_gso_segment - Perform segmentation on skb. * @skb: buffer to segment * @features: features for the output path (see dev->features) * @tx_path: whether it is called in TX path * * This function segments the given skb and returns a list of segments. * * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. * * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. */ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) { struct sk_buff *segs; if (unlikely(skb_needs_check(skb, tx_path))) { int err; /* We're going to init ->check field in TCP or UDP header */ err = skb_cow_head(skb, 0); if (err < 0) return ERR_PTR(err); } /* Only report GSO partial support if it will enable us to * support segmentation on this frame without needing additional * work. */ if (features & NETIF_F_GSO_PARTIAL) { netdev_features_t partial_features = NETIF_F_GSO_ROBUST; struct net_device *dev = skb->dev; partial_features |= dev->features & dev->gso_partial_features; if (!skb_gso_ok(skb, features | partial_features)) features &= ~NETIF_F_GSO_PARTIAL; } BUILD_BUG_ON(SKB_GSO_CB_OFFSET + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); SKB_GSO_CB(skb)->encap_level = 0; skb_reset_mac_header(skb); skb_reset_mac_len(skb); segs = skb_mac_gso_segment(skb, features); if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) skb_warn_bad_offload(skb); return segs; } EXPORT_SYMBOL(__skb_gso_segment); /** * skb_gso_transport_seglen - Return length of individual segments of a gso packet * * @skb: GSO skb * * skb_gso_transport_seglen is used to determine the real size of the * individual segments, including Layer4 headers (TCP/UDP). * * The MAC/L2 or network (IP, IPv6) headers are not accounted for. */ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); unsigned int thlen = 0; if (skb->encapsulation) { thlen = skb_inner_transport_header(skb) - skb_transport_header(skb); if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) thlen += inner_tcp_hdrlen(skb); } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { thlen = tcp_hdrlen(skb); } else if (unlikely(skb_is_gso_sctp(skb))) { thlen = sizeof(struct sctphdr); } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { thlen = sizeof(struct udphdr); } /* UFO sets gso_size to the size of the fragmentation * payload, i.e. the size of the L4 (UDP) header is already * accounted for. */ return thlen + shinfo->gso_size; } /** * skb_gso_network_seglen - Return length of individual segments of a gso packet * * @skb: GSO skb * * skb_gso_network_seglen is used to determine the real size of the * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). * * The MAC/L2 header is not accounted for. */ static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) { unsigned int hdr_len = skb_transport_header(skb) - skb_network_header(skb); return hdr_len + skb_gso_transport_seglen(skb); } /** * skb_gso_mac_seglen - Return length of individual segments of a gso packet * * @skb: GSO skb * * skb_gso_mac_seglen is used to determine the real size of the * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 * headers (TCP/UDP). */ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) { unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); return hdr_len + skb_gso_transport_seglen(skb); } /** * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS * * There are a couple of instances where we have a GSO skb, and we * want to determine what size it would be after it is segmented. * * We might want to check: * - L3+L4+payload size (e.g. IP forwarding) * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) * * This is a helper to do that correctly considering GSO_BY_FRAGS. * * @skb: GSO skb * * @seg_len: The segmented length (from skb_gso_*_seglen). In the * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. * * @max_len: The maximum permissible length. * * Returns true if the segmented length <= max length. */ static inline bool skb_gso_size_check(const struct sk_buff *skb, unsigned int seg_len, unsigned int max_len) { const struct skb_shared_info *shinfo = skb_shinfo(skb); const struct sk_buff *iter; if (shinfo->gso_size != GSO_BY_FRAGS) return seg_len <= max_len; /* Undo this so we can re-use header sizes */ seg_len -= GSO_BY_FRAGS; skb_walk_frags(skb, iter) { if (seg_len + skb_headlen(iter) > max_len) return false; } return true; } /** * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? * * @skb: GSO skb * @mtu: MTU to validate against * * skb_gso_validate_network_len validates if a given skb will fit a * wanted MTU once split. It considers L3 headers, L4 headers, and the * payload. */ bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) { return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); } EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); /** * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? * * @skb: GSO skb * @len: length to validate against * * skb_gso_validate_mac_len validates if a given skb will fit a wanted * length once split, including L2, L3 and L4 headers and the payload. */ bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) { return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); } EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); |
34 27 33 1 3 8 7 27 7 7 8 1 2 5 4 4 1 3 3 7 1 1 1 1 1 1 1 17 15 4 4 4 94 91 16 7 89 82 6 2 8 8 8 5 3 4 4 2 6 1 6 1 7 7 7 7 7 2 4 1 1 1 37 18 11 13 37 37 22 22 2 2 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 | // SPDX-License-Identifier: GPL-2.0-or-later /* * USB Network driver infrastructure * Copyright (C) 2000-2005 by David Brownell * Copyright (C) 2003-2005 David Hollis <dhollis@davehollis.com> */ /* * This is a generic "USB networking" framework that works with several * kinds of full and high speed networking devices: host-to-host cables, * smart usb peripherals, and actual Ethernet adapters. * * These devices usually differ in terms of control protocols (if they * even have one!) and sometimes they define new framing to wrap or batch * Ethernet packets. Otherwise, they talk to USB pretty much the same, * so interface (un)binding, endpoint I/O queues, fault handling, and other * issues can usefully be addressed by this framework. */ #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ctype.h> #include <linux/ethtool.h> #include <linux/workqueue.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/usbnet.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/pm_runtime.h> /*-------------------------------------------------------------------------*/ /* * Nineteen USB 1.1 max size bulk transactions per frame (ms), max. * Several dozen bytes of IPv4 data can fit in two such transactions. * One maximum size Ethernet packet takes twenty four of them. * For high speed, each frame comfortably fits almost 36 max size * Ethernet packets (so queues should be bigger). * * The goal is to let the USB host controller be busy for 5msec or * more before an irq is required, under load. Jumbograms change * the equation. */ #define MAX_QUEUE_MEMORY (60 * 1518) #define RX_QLEN(dev) ((dev)->rx_qlen) #define TX_QLEN(dev) ((dev)->tx_qlen) // reawaken network queue this soon after stopping; else watchdog barks #define TX_TIMEOUT_JIFFIES (5*HZ) /* throttle rx/tx briefly after some faults, so hub_wq might disconnect() * us (it polls at HZ/4 usually) before we report too many false errors. */ #define THROTTLE_JIFFIES (HZ/8) // between wakeups #define UNLINK_TIMEOUT_MS 3 /*-------------------------------------------------------------------------*/ // randomly generated ethernet address static u8 node_id [ETH_ALEN]; /* use ethtool to change the level for any given device */ static int msg_level = -1; module_param (msg_level, int, 0); MODULE_PARM_DESC (msg_level, "Override default message level"); /*-------------------------------------------------------------------------*/ static const char * const usbnet_event_names[] = { [EVENT_TX_HALT] = "EVENT_TX_HALT", [EVENT_RX_HALT] = "EVENT_RX_HALT", [EVENT_RX_MEMORY] = "EVENT_RX_MEMORY", [EVENT_STS_SPLIT] = "EVENT_STS_SPLIT", [EVENT_LINK_RESET] = "EVENT_LINK_RESET", [EVENT_RX_PAUSED] = "EVENT_RX_PAUSED", [EVENT_DEV_ASLEEP] = "EVENT_DEV_ASLEEP", [EVENT_DEV_OPEN] = "EVENT_DEV_OPEN", [EVENT_DEVICE_REPORT_IDLE] = "EVENT_DEVICE_REPORT_IDLE", [EVENT_NO_RUNTIME_PM] = "EVENT_NO_RUNTIME_PM", [EVENT_RX_KILL] = "EVENT_RX_KILL", [EVENT_LINK_CHANGE] = "EVENT_LINK_CHANGE", [EVENT_SET_RX_MODE] = "EVENT_SET_RX_MODE", [EVENT_NO_IP_ALIGN] = "EVENT_NO_IP_ALIGN", }; /* handles CDC Ethernet and many other network "bulk data" interfaces */ int usbnet_get_endpoints(struct usbnet *dev, struct usb_interface *intf) { int tmp; struct usb_host_interface *alt = NULL; struct usb_host_endpoint *in = NULL, *out = NULL; struct usb_host_endpoint *status = NULL; for (tmp = 0; tmp < intf->num_altsetting; tmp++) { unsigned ep; in = out = status = NULL; alt = intf->altsetting + tmp; /* take the first altsetting with in-bulk + out-bulk; * remember any status endpoint, just in case; * ignore other endpoints and altsettings. */ for (ep = 0; ep < alt->desc.bNumEndpoints; ep++) { struct usb_host_endpoint *e; int intr = 0; e = alt->endpoint + ep; /* ignore endpoints which cannot transfer data */ if (!usb_endpoint_maxp(&e->desc)) continue; switch (e->desc.bmAttributes) { case USB_ENDPOINT_XFER_INT: if (!usb_endpoint_dir_in(&e->desc)) continue; intr = 1; fallthrough; case USB_ENDPOINT_XFER_BULK: break; default: continue; } if (usb_endpoint_dir_in(&e->desc)) { if (!intr && !in) in = e; else if (intr && !status) status = e; } else { if (!out) out = e; } } if (in && out) break; } if (!alt || !in || !out) return -EINVAL; if (alt->desc.bAlternateSetting != 0 || !(dev->driver_info->flags & FLAG_NO_SETINT)) { tmp = usb_set_interface (dev->udev, alt->desc.bInterfaceNumber, alt->desc.bAlternateSetting); if (tmp < 0) return tmp; } dev->in = usb_rcvbulkpipe (dev->udev, in->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); dev->out = usb_sndbulkpipe (dev->udev, out->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); dev->status = status; return 0; } EXPORT_SYMBOL_GPL(usbnet_get_endpoints); int usbnet_get_ethernet_addr(struct usbnet *dev, int iMACAddress) { u8 addr[ETH_ALEN]; int tmp = -1, ret; unsigned char buf [13]; ret = usb_string(dev->udev, iMACAddress, buf, sizeof buf); if (ret == 12) tmp = hex2bin(addr, buf, 6); if (tmp < 0) { dev_dbg(&dev->udev->dev, "bad MAC string %d fetch, %d\n", iMACAddress, tmp); if (ret >= 0) ret = -EINVAL; return ret; } eth_hw_addr_set(dev->net, addr); return 0; } EXPORT_SYMBOL_GPL(usbnet_get_ethernet_addr); static void intr_complete (struct urb *urb) { struct usbnet *dev = urb->context; int status = urb->status; switch (status) { /* success */ case 0: dev->driver_info->status(dev, urb); break; /* software-driven interface shutdown */ case -ENOENT: /* urb killed */ case -ESHUTDOWN: /* hardware gone */ netif_dbg(dev, ifdown, dev->net, "intr shutdown, code %d\n", status); return; /* NOTE: not throttling like RX/TX, since this endpoint * already polls infrequently */ default: netdev_dbg(dev->net, "intr status %d\n", status); break; } status = usb_submit_urb (urb, GFP_ATOMIC); if (status != 0) netif_err(dev, timer, dev->net, "intr resubmit --> %d\n", status); } static int init_status (struct usbnet *dev, struct usb_interface *intf) { char *buf = NULL; unsigned pipe = 0; unsigned maxp; unsigned period; if (!dev->driver_info->status) return 0; pipe = usb_rcvintpipe (dev->udev, dev->status->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); maxp = usb_maxpacket(dev->udev, pipe); /* avoid 1 msec chatter: min 8 msec poll rate */ period = max ((int) dev->status->desc.bInterval, (dev->udev->speed == USB_SPEED_HIGH) ? 7 : 3); buf = kmalloc (maxp, GFP_KERNEL); if (buf) { dev->interrupt = usb_alloc_urb (0, GFP_KERNEL); if (!dev->interrupt) { kfree (buf); return -ENOMEM; } else { usb_fill_int_urb(dev->interrupt, dev->udev, pipe, buf, maxp, intr_complete, dev, period); dev->interrupt->transfer_flags |= URB_FREE_BUFFER; dev_dbg(&intf->dev, "status ep%din, %d bytes period %d\n", usb_pipeendpoint(pipe), maxp, period); } } return 0; } /* Submit the interrupt URB if not previously submitted, increasing refcount */ int usbnet_status_start(struct usbnet *dev, gfp_t mem_flags) { int ret = 0; WARN_ON_ONCE(dev->interrupt == NULL); if (dev->interrupt) { mutex_lock(&dev->interrupt_mutex); if (++dev->interrupt_count == 1) ret = usb_submit_urb(dev->interrupt, mem_flags); dev_dbg(&dev->udev->dev, "incremented interrupt URB count to %d\n", dev->interrupt_count); mutex_unlock(&dev->interrupt_mutex); } return ret; } EXPORT_SYMBOL_GPL(usbnet_status_start); /* For resume; submit interrupt URB if previously submitted */ static int __usbnet_status_start_force(struct usbnet *dev, gfp_t mem_flags) { int ret = 0; mutex_lock(&dev->interrupt_mutex); if (dev->interrupt_count) { ret = usb_submit_urb(dev->interrupt, mem_flags); dev_dbg(&dev->udev->dev, "submitted interrupt URB for resume\n"); } mutex_unlock(&dev->interrupt_mutex); return ret; } /* Kill the interrupt URB if all submitters want it killed */ void usbnet_status_stop(struct usbnet *dev) { if (dev->interrupt) { mutex_lock(&dev->interrupt_mutex); WARN_ON(dev->interrupt_count == 0); if (dev->interrupt_count && --dev->interrupt_count == 0) usb_kill_urb(dev->interrupt); dev_dbg(&dev->udev->dev, "decremented interrupt URB count to %d\n", dev->interrupt_count); mutex_unlock(&dev->interrupt_mutex); } } EXPORT_SYMBOL_GPL(usbnet_status_stop); /* For suspend; always kill interrupt URB */ static void __usbnet_status_stop_force(struct usbnet *dev) { if (dev->interrupt) { mutex_lock(&dev->interrupt_mutex); usb_kill_urb(dev->interrupt); dev_dbg(&dev->udev->dev, "killed interrupt URB for suspend\n"); mutex_unlock(&dev->interrupt_mutex); } } /* Passes this packet up the stack, updating its accounting. * Some link protocols batch packets, so their rx_fixup paths * can return clones as well as just modify the original skb. */ void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb) { struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->net->tstats); unsigned long flags; int status; if (test_bit(EVENT_RX_PAUSED, &dev->flags)) { skb_queue_tail(&dev->rxq_pause, skb); return; } /* only update if unset to allow minidriver rx_fixup override */ if (skb->protocol == 0) skb->protocol = eth_type_trans (skb, dev->net); flags = u64_stats_update_begin_irqsave(&stats64->syncp); u64_stats_inc(&stats64->rx_packets); u64_stats_add(&stats64->rx_bytes, skb->len); u64_stats_update_end_irqrestore(&stats64->syncp, flags); netif_dbg(dev, rx_status, dev->net, "< rx, len %zu, type 0x%x\n", skb->len + sizeof (struct ethhdr), skb->protocol); memset (skb->cb, 0, sizeof (struct skb_data)); if (skb_defer_rx_timestamp(skb)) return; status = netif_rx (skb); if (status != NET_RX_SUCCESS) netif_dbg(dev, rx_err, dev->net, "netif_rx status %d\n", status); } EXPORT_SYMBOL_GPL(usbnet_skb_return); /* must be called if hard_mtu or rx_urb_size changed */ void usbnet_update_max_qlen(struct usbnet *dev) { enum usb_device_speed speed = dev->udev->speed; if (!dev->rx_urb_size || !dev->hard_mtu) goto insanity; switch (speed) { case USB_SPEED_HIGH: dev->rx_qlen = MAX_QUEUE_MEMORY / dev->rx_urb_size; dev->tx_qlen = MAX_QUEUE_MEMORY / dev->hard_mtu; break; case USB_SPEED_SUPER: case USB_SPEED_SUPER_PLUS: /* * Not take default 5ms qlen for super speed HC to * save memory, and iperf tests show 2.5ms qlen can * work well */ dev->rx_qlen = 5 * MAX_QUEUE_MEMORY / dev->rx_urb_size; dev->tx_qlen = 5 * MAX_QUEUE_MEMORY / dev->hard_mtu; break; default: insanity: dev->rx_qlen = dev->tx_qlen = 4; } } EXPORT_SYMBOL_GPL(usbnet_update_max_qlen); /*------------------------------------------------------------------------- * * Network Device Driver (peer link to "Host Device", from USB host) * *-------------------------------------------------------------------------*/ int usbnet_change_mtu (struct net_device *net, int new_mtu) { struct usbnet *dev = netdev_priv(net); int ll_mtu = new_mtu + net->hard_header_len; int old_hard_mtu = dev->hard_mtu; int old_rx_urb_size = dev->rx_urb_size; // no second zero-length packet read wanted after mtu-sized packets if ((ll_mtu % dev->maxpacket) == 0) return -EDOM; net->mtu = new_mtu; dev->hard_mtu = net->mtu + net->hard_header_len; if (dev->rx_urb_size == old_hard_mtu) { dev->rx_urb_size = dev->hard_mtu; if (dev->rx_urb_size > old_rx_urb_size) { usbnet_pause_rx(dev); usbnet_unlink_rx_urbs(dev); usbnet_resume_rx(dev); } } /* max qlen depend on hard_mtu and rx_urb_size */ usbnet_update_max_qlen(dev); return 0; } EXPORT_SYMBOL_GPL(usbnet_change_mtu); /* The caller must hold list->lock */ static void __usbnet_queue_skb(struct sk_buff_head *list, struct sk_buff *newsk, enum skb_state state) { struct skb_data *entry = (struct skb_data *) newsk->cb; __skb_queue_tail(list, newsk); entry->state = state; } /*-------------------------------------------------------------------------*/ /* some LK 2.4 HCDs oopsed if we freed or resubmitted urbs from * completion callbacks. 2.5 should have fixed those bugs... */ static enum skb_state defer_bh(struct usbnet *dev, struct sk_buff *skb, struct sk_buff_head *list, enum skb_state state) { unsigned long flags; enum skb_state old_state; struct skb_data *entry = (struct skb_data *) skb->cb; spin_lock_irqsave(&list->lock, flags); old_state = entry->state; entry->state = state; __skb_unlink(skb, list); /* defer_bh() is never called with list == &dev->done. * spin_lock_nested() tells lockdep that it is OK to take * dev->done.lock here with list->lock held. */ spin_lock_nested(&dev->done.lock, SINGLE_DEPTH_NESTING); __skb_queue_tail(&dev->done, skb); if (dev->done.qlen == 1) tasklet_schedule(&dev->bh); spin_unlock(&dev->done.lock); spin_unlock_irqrestore(&list->lock, flags); return old_state; } /* some work can't be done in tasklets, so we use keventd * * NOTE: annoying asymmetry: if it's active, schedule_work() fails, * but tasklet_schedule() doesn't. hope the failure is rare. */ void usbnet_defer_kevent (struct usbnet *dev, int work) { set_bit (work, &dev->flags); if (!schedule_work (&dev->kevent)) netdev_dbg(dev->net, "kevent %s may have been dropped\n", usbnet_event_names[work]); else netdev_dbg(dev->net, "kevent %s scheduled\n", usbnet_event_names[work]); } EXPORT_SYMBOL_GPL(usbnet_defer_kevent); /*-------------------------------------------------------------------------*/ static void rx_complete (struct urb *urb); static int rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags) { struct sk_buff *skb; struct skb_data *entry; int retval = 0; unsigned long lockflags; size_t size = dev->rx_urb_size; /* prevent rx skb allocation when error ratio is high */ if (test_bit(EVENT_RX_KILL, &dev->flags)) { usb_free_urb(urb); return -ENOLINK; } if (test_bit(EVENT_NO_IP_ALIGN, &dev->flags)) skb = __netdev_alloc_skb(dev->net, size, flags); else skb = __netdev_alloc_skb_ip_align(dev->net, size, flags); if (!skb) { netif_dbg(dev, rx_err, dev->net, "no rx skb\n"); usbnet_defer_kevent (dev, EVENT_RX_MEMORY); usb_free_urb (urb); return -ENOMEM; } entry = (struct skb_data *) skb->cb; entry->urb = urb; entry->dev = dev; entry->length = 0; usb_fill_bulk_urb (urb, dev->udev, dev->in, skb->data, size, rx_complete, skb); spin_lock_irqsave (&dev->rxq.lock, lockflags); if (netif_running (dev->net) && netif_device_present (dev->net) && test_bit(EVENT_DEV_OPEN, &dev->flags) && !test_bit (EVENT_RX_HALT, &dev->flags) && !test_bit (EVENT_DEV_ASLEEP, &dev->flags)) { switch (retval = usb_submit_urb (urb, GFP_ATOMIC)) { case -EPIPE: usbnet_defer_kevent (dev, EVENT_RX_HALT); break; case -ENOMEM: usbnet_defer_kevent (dev, EVENT_RX_MEMORY); break; case -ENODEV: netif_dbg(dev, ifdown, dev->net, "device gone\n"); netif_device_detach (dev->net); break; case -EHOSTUNREACH: retval = -ENOLINK; break; default: netif_dbg(dev, rx_err, dev->net, "rx submit, %d\n", retval); tasklet_schedule (&dev->bh); break; case 0: __usbnet_queue_skb(&dev->rxq, skb, rx_start); } } else { netif_dbg(dev, ifdown, dev->net, "rx: stopped\n"); retval = -ENOLINK; } spin_unlock_irqrestore (&dev->rxq.lock, lockflags); if (retval) { dev_kfree_skb_any (skb); usb_free_urb (urb); } return retval; } /*-------------------------------------------------------------------------*/ static inline int rx_process(struct usbnet *dev, struct sk_buff *skb) { if (dev->driver_info->rx_fixup && !dev->driver_info->rx_fixup (dev, skb)) { /* With RX_ASSEMBLE, rx_fixup() must update counters */ if (!(dev->driver_info->flags & FLAG_RX_ASSEMBLE)) dev->net->stats.rx_errors++; return -EPROTO; } // else network stack removes extra byte if we forced a short packet /* all data was already cloned from skb inside the driver */ if (dev->driver_info->flags & FLAG_MULTI_PACKET) return -EALREADY; if (skb->len < ETH_HLEN) { dev->net->stats.rx_errors++; dev->net->stats.rx_length_errors++; netif_dbg(dev, rx_err, dev->net, "rx length %d\n", skb->len); return -EPROTO; } usbnet_skb_return(dev, skb); return 0; } /*-------------------------------------------------------------------------*/ static void rx_complete (struct urb *urb) { struct sk_buff *skb = (struct sk_buff *) urb->context; struct skb_data *entry = (struct skb_data *) skb->cb; struct usbnet *dev = entry->dev; int urb_status = urb->status; enum skb_state state; skb_put (skb, urb->actual_length); state = rx_done; entry->urb = NULL; switch (urb_status) { /* success */ case 0: break; /* stalls need manual reset. this is rare ... except that * when going through USB 2.0 TTs, unplug appears this way. * we avoid the highspeed version of the ETIMEDOUT/EILSEQ * storm, recovering as needed. */ case -EPIPE: dev->net->stats.rx_errors++; usbnet_defer_kevent (dev, EVENT_RX_HALT); fallthrough; /* software-driven interface shutdown */ case -ECONNRESET: /* async unlink */ case -ESHUTDOWN: /* hardware gone */ netif_dbg(dev, ifdown, dev->net, "rx shutdown, code %d\n", urb_status); goto block; /* we get controller i/o faults during hub_wq disconnect() delays. * throttle down resubmits, to avoid log floods; just temporarily, * so we still recover when the fault isn't a hub_wq delay. */ case -EPROTO: case -ETIME: case -EILSEQ: dev->net->stats.rx_errors++; if (!timer_pending (&dev->delay)) { mod_timer (&dev->delay, jiffies + THROTTLE_JIFFIES); netif_dbg(dev, link, dev->net, "rx throttle %d\n", urb_status); } block: state = rx_cleanup; entry->urb = urb; urb = NULL; break; /* data overrun ... flush fifo? */ case -EOVERFLOW: dev->net->stats.rx_over_errors++; fallthrough; default: state = rx_cleanup; dev->net->stats.rx_errors++; netif_dbg(dev, rx_err, dev->net, "rx status %d\n", urb_status); break; } /* stop rx if packet error rate is high */ if (++dev->pkt_cnt > 30) { dev->pkt_cnt = 0; dev->pkt_err = 0; } else { if (state == rx_cleanup) dev->pkt_err++; if (dev->pkt_err > 20) set_bit(EVENT_RX_KILL, &dev->flags); } state = defer_bh(dev, skb, &dev->rxq, state); if (urb) { if (netif_running (dev->net) && !test_bit (EVENT_RX_HALT, &dev->flags) && state != unlink_start) { rx_submit (dev, urb, GFP_ATOMIC); usb_mark_last_busy(dev->udev); return; } usb_free_urb (urb); } netif_dbg(dev, rx_err, dev->net, "no read resubmitted\n"); } /*-------------------------------------------------------------------------*/ void usbnet_pause_rx(struct usbnet *dev) { set_bit(EVENT_RX_PAUSED, &dev->flags); netif_dbg(dev, rx_status, dev->net, "paused rx queue enabled\n"); } EXPORT_SYMBOL_GPL(usbnet_pause_rx); void usbnet_resume_rx(struct usbnet *dev) { struct sk_buff *skb; int num = 0; clear_bit(EVENT_RX_PAUSED, &dev->flags); while ((skb = skb_dequeue(&dev->rxq_pause)) != NULL) { usbnet_skb_return(dev, skb); num++; } tasklet_schedule(&dev->bh); netif_dbg(dev, rx_status, dev->net, "paused rx queue disabled, %d skbs requeued\n", num); } EXPORT_SYMBOL_GPL(usbnet_resume_rx); void usbnet_purge_paused_rxq(struct usbnet *dev) { skb_queue_purge(&dev->rxq_pause); } EXPORT_SYMBOL_GPL(usbnet_purge_paused_rxq); /*-------------------------------------------------------------------------*/ // unlink pending rx/tx; completion handlers do all other cleanup static int unlink_urbs (struct usbnet *dev, struct sk_buff_head *q) { unsigned long flags; struct sk_buff *skb; int count = 0; spin_lock_irqsave (&q->lock, flags); while (!skb_queue_empty(q)) { struct skb_data *entry; struct urb *urb; int retval; skb_queue_walk(q, skb) { entry = (struct skb_data *) skb->cb; if (entry->state != unlink_start) goto found; } break; found: entry->state = unlink_start; urb = entry->urb; /* * Get reference count of the URB to avoid it to be * freed during usb_unlink_urb, which may trigger * use-after-free problem inside usb_unlink_urb since * usb_unlink_urb is always racing with .complete * handler(include defer_bh). */ usb_get_urb(urb); spin_unlock_irqrestore(&q->lock, flags); // during some PM-driven resume scenarios, // these (async) unlinks complete immediately retval = usb_unlink_urb (urb); if (retval != -EINPROGRESS && retval != 0) netdev_dbg(dev->net, "unlink urb err, %d\n", retval); else count++; usb_put_urb(urb); spin_lock_irqsave(&q->lock, flags); } spin_unlock_irqrestore (&q->lock, flags); return count; } // Flush all pending rx urbs // minidrivers may need to do this when the MTU changes void usbnet_unlink_rx_urbs(struct usbnet *dev) { if (netif_running(dev->net)) { (void) unlink_urbs (dev, &dev->rxq); tasklet_schedule(&dev->bh); } } EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs); /*-------------------------------------------------------------------------*/ static void wait_skb_queue_empty(struct sk_buff_head *q) { unsigned long flags; spin_lock_irqsave(&q->lock, flags); while (!skb_queue_empty(q)) { spin_unlock_irqrestore(&q->lock, flags); schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); set_current_state(TASK_UNINTERRUPTIBLE); spin_lock_irqsave(&q->lock, flags); } spin_unlock_irqrestore(&q->lock, flags); } // precondition: never called in_interrupt static void usbnet_terminate_urbs(struct usbnet *dev) { DECLARE_WAITQUEUE(wait, current); int temp; /* ensure there are no more active urbs */ add_wait_queue(&dev->wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); temp = unlink_urbs(dev, &dev->txq) + unlink_urbs(dev, &dev->rxq); /* maybe wait for deletions to finish. */ wait_skb_queue_empty(&dev->rxq); wait_skb_queue_empty(&dev->txq); wait_skb_queue_empty(&dev->done); netif_dbg(dev, ifdown, dev->net, "waited for %d urb completions\n", temp); set_current_state(TASK_RUNNING); remove_wait_queue(&dev->wait, &wait); } int usbnet_stop (struct net_device *net) { struct usbnet *dev = netdev_priv(net); const struct driver_info *info = dev->driver_info; int retval, pm, mpn; clear_bit(EVENT_DEV_OPEN, &dev->flags); netif_stop_queue (net); netif_info(dev, ifdown, dev->net, "stop stats: rx/tx %lu/%lu, errs %lu/%lu\n", net->stats.rx_packets, net->stats.tx_packets, net->stats.rx_errors, net->stats.tx_errors); /* to not race resume */ pm = usb_autopm_get_interface(dev->intf); /* allow minidriver to stop correctly (wireless devices to turn off * radio etc) */ if (info->stop) { retval = info->stop(dev); if (retval < 0) netif_info(dev, ifdown, dev->net, "stop fail (%d) usbnet usb-%s-%s, %s\n", retval, dev->udev->bus->bus_name, dev->udev->devpath, info->description); } if (!(info->flags & FLAG_AVOID_UNLINK_URBS)) usbnet_terminate_urbs(dev); usbnet_status_stop(dev); usbnet_purge_paused_rxq(dev); mpn = !test_and_clear_bit(EVENT_NO_RUNTIME_PM, &dev->flags); /* deferred work (timer, softirq, task) must also stop */ dev->flags = 0; del_timer_sync (&dev->delay); tasklet_kill (&dev->bh); cancel_work_sync(&dev->kevent); if (!pm) usb_autopm_put_interface(dev->intf); if (info->manage_power && mpn) info->manage_power(dev, 0); else usb_autopm_put_interface(dev->intf); return 0; } EXPORT_SYMBOL_GPL(usbnet_stop); /*-------------------------------------------------------------------------*/ // posts reads, and enables write queuing // precondition: never called in_interrupt int usbnet_open (struct net_device *net) { struct usbnet *dev = netdev_priv(net); int retval; const struct driver_info *info = dev->driver_info; if ((retval = usb_autopm_get_interface(dev->intf)) < 0) { netif_info(dev, ifup, dev->net, "resumption fail (%d) usbnet usb-%s-%s, %s\n", retval, dev->udev->bus->bus_name, dev->udev->devpath, info->description); goto done_nopm; } // put into "known safe" state if (info->reset && (retval = info->reset (dev)) < 0) { netif_info(dev, ifup, dev->net, "open reset fail (%d) usbnet usb-%s-%s, %s\n", retval, dev->udev->bus->bus_name, dev->udev->devpath, info->description); goto done; } /* hard_mtu or rx_urb_size may change in reset() */ usbnet_update_max_qlen(dev); // insist peer be connected if (info->check_connect && (retval = info->check_connect (dev)) < 0) { netif_err(dev, ifup, dev->net, "can't open; %d\n", retval); goto done; } /* start any status interrupt transfer */ if (dev->interrupt) { retval = usbnet_status_start(dev, GFP_KERNEL); if (retval < 0) { netif_err(dev, ifup, dev->net, "intr submit %d\n", retval); goto done; } } set_bit(EVENT_DEV_OPEN, &dev->flags); netif_start_queue (net); netif_info(dev, ifup, dev->net, "open: enable queueing (rx %d, tx %d) mtu %d %s framing\n", (int)RX_QLEN(dev), (int)TX_QLEN(dev), dev->net->mtu, (dev->driver_info->flags & FLAG_FRAMING_NC) ? "NetChip" : (dev->driver_info->flags & FLAG_FRAMING_GL) ? "GeneSys" : (dev->driver_info->flags & FLAG_FRAMING_Z) ? "Zaurus" : (dev->driver_info->flags & FLAG_FRAMING_RN) ? "RNDIS" : (dev->driver_info->flags & FLAG_FRAMING_AX) ? "ASIX" : "simple"); /* reset rx error state */ dev->pkt_cnt = 0; dev->pkt_err = 0; clear_bit(EVENT_RX_KILL, &dev->flags); // delay posting reads until we're fully open tasklet_schedule (&dev->bh); if (info->manage_power) { retval = info->manage_power(dev, 1); if (retval < 0) { retval = 0; set_bit(EVENT_NO_RUNTIME_PM, &dev->flags); } else { usb_autopm_put_interface(dev->intf); } } return retval; done: usb_autopm_put_interface(dev->intf); done_nopm: return retval; } EXPORT_SYMBOL_GPL(usbnet_open); /*-------------------------------------------------------------------------*/ /* ethtool methods; minidrivers may need to add some more, but * they'll probably want to use this base set. */ /* These methods are written on the assumption that the device * uses MII */ int usbnet_get_link_ksettings_mii(struct net_device *net, struct ethtool_link_ksettings *cmd) { struct usbnet *dev = netdev_priv(net); if (!dev->mii.mdio_read) return -EOPNOTSUPP; mii_ethtool_get_link_ksettings(&dev->mii, cmd); return 0; } EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings_mii); int usbnet_get_link_ksettings_internal(struct net_device *net, struct ethtool_link_ksettings *cmd) { struct usbnet *dev = netdev_priv(net); /* the assumption that speed is equal on tx and rx * is deeply engrained into the networking layer. * For wireless stuff it is not true. * We assume that rx_speed matters more. */ if (dev->rx_speed != SPEED_UNSET) cmd->base.speed = dev->rx_speed / 1000000; else if (dev->tx_speed != SPEED_UNSET) cmd->base.speed = dev->tx_speed / 1000000; else cmd->base.speed = SPEED_UNKNOWN; return 0; } EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings_internal); int usbnet_set_link_ksettings_mii(struct net_device *net, const struct ethtool_link_ksettings *cmd) { struct usbnet *dev = netdev_priv(net); int retval; if (!dev->mii.mdio_write) return -EOPNOTSUPP; retval = mii_ethtool_set_link_ksettings(&dev->mii, cmd); /* link speed/duplex might have changed */ if (dev->driver_info->link_reset) dev->driver_info->link_reset(dev); /* hard_mtu or rx_urb_size may change in link_reset() */ usbnet_update_max_qlen(dev); return retval; } EXPORT_SYMBOL_GPL(usbnet_set_link_ksettings_mii); u32 usbnet_get_link (struct net_device *net) { struct usbnet *dev = netdev_priv(net); /* If a check_connect is defined, return its result */ if (dev->driver_info->check_connect) return dev->driver_info->check_connect (dev) == 0; /* if the device has mii operations, use those */ if (dev->mii.mdio_read) return mii_link_ok(&dev->mii); /* Otherwise, dtrt for drivers calling netif_carrier_{on,off} */ return ethtool_op_get_link(net); } EXPORT_SYMBOL_GPL(usbnet_get_link); int usbnet_nway_reset(struct net_device *net) { struct usbnet *dev = netdev_priv(net); if (!dev->mii.mdio_write) return -EOPNOTSUPP; return mii_nway_restart(&dev->mii); } EXPORT_SYMBOL_GPL(usbnet_nway_reset); void usbnet_get_drvinfo (struct net_device *net, struct ethtool_drvinfo *info) { struct usbnet *dev = netdev_priv(net); strscpy(info->driver, dev->driver_name, sizeof(info->driver)); strscpy(info->fw_version, dev->driver_info->description, sizeof(info->fw_version)); usb_make_path (dev->udev, info->bus_info, sizeof info->bus_info); } EXPORT_SYMBOL_GPL(usbnet_get_drvinfo); u32 usbnet_get_msglevel (struct net_device *net) { struct usbnet *dev = netdev_priv(net); return dev->msg_enable; } EXPORT_SYMBOL_GPL(usbnet_get_msglevel); void usbnet_set_msglevel (struct net_device *net, u32 level) { struct usbnet *dev = netdev_priv(net); dev->msg_enable = level; } EXPORT_SYMBOL_GPL(usbnet_set_msglevel); /* drivers may override default ethtool_ops in their bind() routine */ static const struct ethtool_ops usbnet_ethtool_ops = { .get_link = usbnet_get_link, .nway_reset = usbnet_nway_reset, .get_drvinfo = usbnet_get_drvinfo, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_ts_info = ethtool_op_get_ts_info, .get_link_ksettings = usbnet_get_link_ksettings_mii, .set_link_ksettings = usbnet_set_link_ksettings_mii, }; /*-------------------------------------------------------------------------*/ static void __handle_link_change(struct usbnet *dev) { if (!test_bit(EVENT_DEV_OPEN, &dev->flags)) return; if (!netif_carrier_ok(dev->net)) { /* kill URBs for reading packets to save bus bandwidth */ unlink_urbs(dev, &dev->rxq); /* * tx_timeout will unlink URBs for sending packets and * tx queue is stopped by netcore after link becomes off */ } else { /* submitting URBs for reading packets */ tasklet_schedule(&dev->bh); } /* hard_mtu or rx_urb_size may change during link change */ usbnet_update_max_qlen(dev); clear_bit(EVENT_LINK_CHANGE, &dev->flags); } void usbnet_set_rx_mode(struct net_device *net) { struct usbnet *dev = netdev_priv(net); usbnet_defer_kevent(dev, EVENT_SET_RX_MODE); } EXPORT_SYMBOL_GPL(usbnet_set_rx_mode); static void __handle_set_rx_mode(struct usbnet *dev) { if (dev->driver_info->set_rx_mode) (dev->driver_info->set_rx_mode)(dev); clear_bit(EVENT_SET_RX_MODE, &dev->flags); } /* work that cannot be done in interrupt context uses keventd. * * NOTE: with 2.5 we could do more of this using completion callbacks, * especially now that control transfers can be queued. */ static void usbnet_deferred_kevent (struct work_struct *work) { struct usbnet *dev = container_of(work, struct usbnet, kevent); int status; /* usb_clear_halt() needs a thread context */ if (test_bit (EVENT_TX_HALT, &dev->flags)) { unlink_urbs (dev, &dev->txq); status = usb_autopm_get_interface(dev->intf); if (status < 0) goto fail_pipe; status = usb_clear_halt (dev->udev, dev->out); usb_autopm_put_interface(dev->intf); if (status < 0 && status != -EPIPE && status != -ESHUTDOWN) { if (netif_msg_tx_err (dev)) fail_pipe: netdev_err(dev->net, "can't clear tx halt, status %d\n", status); } else { clear_bit (EVENT_TX_HALT, &dev->flags); if (status != -ESHUTDOWN) netif_wake_queue (dev->net); } } if (test_bit (EVENT_RX_HALT, &dev->flags)) { unlink_urbs (dev, &dev->rxq); status = usb_autopm_get_interface(dev->intf); if (status < 0) goto fail_halt; status = usb_clear_halt (dev->udev, dev->in); usb_autopm_put_interface(dev->intf); if (status < 0 && status != -EPIPE && status != -ESHUTDOWN) { if (netif_msg_rx_err (dev)) fail_halt: netdev_err(dev->net, "can't clear rx halt, status %d\n", status); } else { clear_bit (EVENT_RX_HALT, &dev->flags); tasklet_schedule (&dev->bh); } } /* tasklet could resubmit itself forever if memory is tight */ if (test_bit (EVENT_RX_MEMORY, &dev->flags)) { struct urb *urb = NULL; int resched = 1; if (netif_running (dev->net)) urb = usb_alloc_urb (0, GFP_KERNEL); else clear_bit (EVENT_RX_MEMORY, &dev->flags); if (urb != NULL) { clear_bit (EVENT_RX_MEMORY, &dev->flags); status = usb_autopm_get_interface(dev->intf); if (status < 0) { usb_free_urb(urb); goto fail_lowmem; } if (rx_submit (dev, urb, GFP_KERNEL) == -ENOLINK) resched = 0; usb_autopm_put_interface(dev->intf); fail_lowmem: if (resched) tasklet_schedule (&dev->bh); } } if (test_bit (EVENT_LINK_RESET, &dev->flags)) { const struct driver_info *info = dev->driver_info; int retval = 0; clear_bit (EVENT_LINK_RESET, &dev->flags); status = usb_autopm_get_interface(dev->intf); if (status < 0) goto skip_reset; if(info->link_reset && (retval = info->link_reset(dev)) < 0) { usb_autopm_put_interface(dev->intf); skip_reset: netdev_info(dev->net, "link reset failed (%d) usbnet usb-%s-%s, %s\n", retval, dev->udev->bus->bus_name, dev->udev->devpath, info->description); } else { usb_autopm_put_interface(dev->intf); } /* handle link change from link resetting */ __handle_link_change(dev); } if (test_bit (EVENT_LINK_CHANGE, &dev->flags)) __handle_link_change(dev); if (test_bit (EVENT_SET_RX_MODE, &dev->flags)) __handle_set_rx_mode(dev); if (dev->flags) netdev_dbg(dev->net, "kevent done, flags = 0x%lx\n", dev->flags); } /*-------------------------------------------------------------------------*/ static void tx_complete (struct urb *urb) { struct sk_buff *skb = (struct sk_buff *) urb->context; struct skb_data *entry = (struct skb_data *) skb->cb; struct usbnet *dev = entry->dev; if (urb->status == 0) { struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->net->tstats); unsigned long flags; flags = u64_stats_update_begin_irqsave(&stats64->syncp); u64_stats_add(&stats64->tx_packets, entry->packets); u64_stats_add(&stats64->tx_bytes, entry->length); u64_stats_update_end_irqrestore(&stats64->syncp, flags); } else { dev->net->stats.tx_errors++; switch (urb->status) { case -EPIPE: usbnet_defer_kevent (dev, EVENT_TX_HALT); break; /* software-driven interface shutdown */ case -ECONNRESET: // async unlink case -ESHUTDOWN: // hardware gone break; /* like rx, tx gets controller i/o faults during hub_wq * delays and so it uses the same throttling mechanism. */ case -EPROTO: case -ETIME: case -EILSEQ: usb_mark_last_busy(dev->udev); if (!timer_pending (&dev->delay)) { mod_timer (&dev->delay, jiffies + THROTTLE_JIFFIES); netif_dbg(dev, link, dev->net, "tx throttle %d\n", urb->status); } netif_stop_queue (dev->net); break; default: netif_dbg(dev, tx_err, dev->net, "tx err %d\n", entry->urb->status); break; } } usb_autopm_put_interface_async(dev->intf); (void) defer_bh(dev, skb, &dev->txq, tx_done); } /*-------------------------------------------------------------------------*/ void usbnet_tx_timeout (struct net_device *net, unsigned int txqueue) { struct usbnet *dev = netdev_priv(net); unlink_urbs (dev, &dev->txq); tasklet_schedule (&dev->bh); /* this needs to be handled individually because the generic layer * doesn't know what is sufficient and could not restore private * information if a remedy of an unconditional reset were used. */ if (dev->driver_info->recover) (dev->driver_info->recover)(dev); } EXPORT_SYMBOL_GPL(usbnet_tx_timeout); /*-------------------------------------------------------------------------*/ static int build_dma_sg(const struct sk_buff *skb, struct urb *urb) { unsigned num_sgs, total_len = 0; int i, s = 0; num_sgs = skb_shinfo(skb)->nr_frags + 1; if (num_sgs == 1) return 0; /* reserve one for zero packet */ urb->sg = kmalloc_array(num_sgs + 1, sizeof(struct scatterlist), GFP_ATOMIC); if (!urb->sg) return -ENOMEM; urb->num_sgs = num_sgs; sg_init_table(urb->sg, urb->num_sgs + 1); sg_set_buf(&urb->sg[s++], skb->data, skb_headlen(skb)); total_len += skb_headlen(skb); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *f = &skb_shinfo(skb)->frags[i]; total_len += skb_frag_size(f); sg_set_page(&urb->sg[i + s], skb_frag_page(f), skb_frag_size(f), skb_frag_off(f)); } urb->transfer_buffer_length = total_len; return 1; } netdev_tx_t usbnet_start_xmit (struct sk_buff *skb, struct net_device *net) { struct usbnet *dev = netdev_priv(net); unsigned int length; struct urb *urb = NULL; struct skb_data *entry; const struct driver_info *info = dev->driver_info; unsigned long flags; int retval; if (skb) skb_tx_timestamp(skb); // some devices want funky USB-level framing, for // win32 driver (usually) and/or hardware quirks if (info->tx_fixup) { skb = info->tx_fixup (dev, skb, GFP_ATOMIC); if (!skb) { /* packet collected; minidriver waiting for more */ if (info->flags & FLAG_MULTI_PACKET) goto not_drop; netif_dbg(dev, tx_err, dev->net, "can't tx_fixup skb\n"); goto drop; } } if (!(urb = usb_alloc_urb (0, GFP_ATOMIC))) { netif_dbg(dev, tx_err, dev->net, "no urb\n"); goto drop; } entry = (struct skb_data *) skb->cb; entry->urb = urb; entry->dev = dev; usb_fill_bulk_urb (urb, dev->udev, dev->out, skb->data, skb->len, tx_complete, skb); if (dev->can_dma_sg) { if (build_dma_sg(skb, urb) < 0) goto drop; } length = urb->transfer_buffer_length; /* don't assume the hardware handles USB_ZERO_PACKET * NOTE: strictly conforming cdc-ether devices should expect * the ZLP here, but ignore the one-byte packet. * NOTE2: CDC NCM specification is different from CDC ECM when * handling ZLP/short packets, so cdc_ncm driver will make short * packet itself if needed. */ if (length % dev->maxpacket == 0) { if (!(info->flags & FLAG_SEND_ZLP)) { if (!(info->flags & FLAG_MULTI_PACKET)) { length++; if (skb_tailroom(skb) && !urb->num_sgs) { skb->data[skb->len] = 0; __skb_put(skb, 1); } else if (urb->num_sgs) sg_set_buf(&urb->sg[urb->num_sgs++], dev->padding_pkt, 1); } } else urb->transfer_flags |= URB_ZERO_PACKET; } urb->transfer_buffer_length = length; if (info->flags & FLAG_MULTI_PACKET) { /* Driver has set number of packets and a length delta. * Calculate the complete length and ensure that it's * positive. */ entry->length += length; if (WARN_ON_ONCE(entry->length <= 0)) entry->length = length; } else { usbnet_set_skb_tx_stats(skb, 1, length); } spin_lock_irqsave(&dev->txq.lock, flags); retval = usb_autopm_get_interface_async(dev->intf); if (retval < 0) { spin_unlock_irqrestore(&dev->txq.lock, flags); goto drop; } if (netif_queue_stopped(net)) { usb_autopm_put_interface_async(dev->intf); spin_unlock_irqrestore(&dev->txq.lock, flags); goto drop; } #ifdef CONFIG_PM /* if this triggers the device is still a sleep */ if (test_bit(EVENT_DEV_ASLEEP, &dev->flags)) { /* transmission will be done in resume */ usb_anchor_urb(urb, &dev->deferred); /* no use to process more packets */ netif_stop_queue(net); usb_put_urb(urb); spin_unlock_irqrestore(&dev->txq.lock, flags); netdev_dbg(dev->net, "Delaying transmission for resumption\n"); goto deferred; } #endif switch ((retval = usb_submit_urb (urb, GFP_ATOMIC))) { case -EPIPE: netif_stop_queue (net); usbnet_defer_kevent (dev, EVENT_TX_HALT); usb_autopm_put_interface_async(dev->intf); break; default: usb_autopm_put_interface_async(dev->intf); netif_dbg(dev, tx_err, dev->net, "tx: submit urb err %d\n", retval); break; case 0: netif_trans_update(net); __usbnet_queue_skb(&dev->txq, skb, tx_start); if (dev->txq.qlen >= TX_QLEN (dev)) netif_stop_queue (net); } spin_unlock_irqrestore (&dev->txq.lock, flags); if (retval) { netif_dbg(dev, tx_err, dev->net, "drop, code %d\n", retval); drop: dev->net->stats.tx_dropped++; not_drop: if (skb) dev_kfree_skb_any (skb); if (urb) { kfree(urb->sg); usb_free_urb(urb); } } else netif_dbg(dev, tx_queued, dev->net, "> tx, len %u, type 0x%x\n", length, skb->protocol); #ifdef CONFIG_PM deferred: #endif return NETDEV_TX_OK; } EXPORT_SYMBOL_GPL(usbnet_start_xmit); static int rx_alloc_submit(struct usbnet *dev, gfp_t flags) { struct urb *urb; int i; int ret = 0; /* don't refill the queue all at once */ for (i = 0; i < 10 && dev->rxq.qlen < RX_QLEN(dev); i++) { urb = usb_alloc_urb(0, flags); if (urb != NULL) { ret = rx_submit(dev, urb, flags); if (ret) goto err; } else { ret = -ENOMEM; goto err; } } err: return ret; } static inline void usb_free_skb(struct sk_buff *skb) { struct skb_data *entry = (struct skb_data *)skb->cb; usb_free_urb(entry->urb); dev_kfree_skb(skb); } /*-------------------------------------------------------------------------*/ // tasklet (work deferred from completions, in_irq) or timer static void usbnet_bh (struct timer_list *t) { struct usbnet *dev = from_timer(dev, t, delay); struct sk_buff *skb; struct skb_data *entry; while ((skb = skb_dequeue (&dev->done))) { entry = (struct skb_data *) skb->cb; switch (entry->state) { case rx_done: if (rx_process(dev, skb)) usb_free_skb(skb); continue; case tx_done: kfree(entry->urb->sg); fallthrough; case rx_cleanup: usb_free_skb(skb); continue; default: netdev_dbg(dev->net, "bogus skb state %d\n", entry->state); } } /* restart RX again after disabling due to high error rate */ clear_bit(EVENT_RX_KILL, &dev->flags); /* waiting for all pending urbs to complete? * only then can we forgo submitting anew */ if (waitqueue_active(&dev->wait)) { if (dev->txq.qlen + dev->rxq.qlen + dev->done.qlen == 0) wake_up_all(&dev->wait); // or are we maybe short a few urbs? } else if (netif_running (dev->net) && netif_device_present (dev->net) && netif_carrier_ok(dev->net) && !timer_pending(&dev->delay) && !test_bit(EVENT_RX_PAUSED, &dev->flags) && !test_bit(EVENT_RX_HALT, &dev->flags)) { int temp = dev->rxq.qlen; if (temp < RX_QLEN(dev)) { if (rx_alloc_submit(dev, GFP_ATOMIC) == -ENOLINK) return; if (temp != dev->rxq.qlen) netif_dbg(dev, link, dev->net, "rxqlen %d --> %d\n", temp, dev->rxq.qlen); if (dev->rxq.qlen < RX_QLEN(dev)) tasklet_schedule (&dev->bh); } if (dev->txq.qlen < TX_QLEN (dev)) netif_wake_queue (dev->net); } } static void usbnet_bh_tasklet(struct tasklet_struct *t) { struct usbnet *dev = from_tasklet(dev, t, bh); usbnet_bh(&dev->delay); } /*------------------------------------------------------------------------- * * USB Device Driver support * *-------------------------------------------------------------------------*/ // precondition: never called in_interrupt void usbnet_disconnect (struct usb_interface *intf) { struct usbnet *dev; struct usb_device *xdev; struct net_device *net; struct urb *urb; dev = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); if (!dev) return; xdev = interface_to_usbdev (intf); netif_info(dev, probe, dev->net, "unregister '%s' usb-%s-%s, %s\n", intf->dev.driver->name, xdev->bus->bus_name, xdev->devpath, dev->driver_info->description); net = dev->net; unregister_netdev (net); while ((urb = usb_get_from_anchor(&dev->deferred))) { dev_kfree_skb(urb->context); kfree(urb->sg); usb_free_urb(urb); } if (dev->driver_info->unbind) dev->driver_info->unbind(dev, intf); usb_kill_urb(dev->interrupt); usb_free_urb(dev->interrupt); kfree(dev->padding_pkt); free_netdev(net); } EXPORT_SYMBOL_GPL(usbnet_disconnect); static const struct net_device_ops usbnet_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_set_rx_mode = usbnet_set_rx_mode, .ndo_change_mtu = usbnet_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; /*-------------------------------------------------------------------------*/ // precondition: never called in_interrupt static const struct device_type wlan_type = { .name = "wlan", }; static const struct device_type wwan_type = { .name = "wwan", }; int usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) { struct usbnet *dev; struct net_device *net; struct usb_host_interface *interface; const struct driver_info *info; struct usb_device *xdev; int status; const char *name; struct usb_driver *driver = to_usb_driver(udev->dev.driver); /* usbnet already took usb runtime pm, so have to enable the feature * for usb interface, otherwise usb_autopm_get_interface may return * failure if RUNTIME_PM is enabled. */ if (!driver->supports_autosuspend) { driver->supports_autosuspend = 1; pm_runtime_enable(&udev->dev); } name = udev->dev.driver->name; info = (const struct driver_info *) prod->driver_info; if (!info) { dev_dbg (&udev->dev, "blacklisted by %s\n", name); return -ENODEV; } xdev = interface_to_usbdev (udev); interface = udev->cur_altsetting; status = -ENOMEM; // set up our own records net = alloc_etherdev(sizeof(*dev)); if (!net) goto out; /* netdev_printk() needs this so do it as early as possible */ SET_NETDEV_DEV(net, &udev->dev); dev = netdev_priv(net); dev->udev = xdev; dev->intf = udev; dev->driver_info = info; dev->driver_name = name; dev->rx_speed = SPEED_UNSET; dev->tx_speed = SPEED_UNSET; dev->msg_enable = netif_msg_init (msg_level, NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK); init_waitqueue_head(&dev->wait); skb_queue_head_init (&dev->rxq); skb_queue_head_init (&dev->txq); skb_queue_head_init (&dev->done); skb_queue_head_init(&dev->rxq_pause); tasklet_setup(&dev->bh, usbnet_bh_tasklet); INIT_WORK (&dev->kevent, usbnet_deferred_kevent); init_usb_anchor(&dev->deferred); timer_setup(&dev->delay, usbnet_bh, 0); mutex_init (&dev->phy_mutex); mutex_init(&dev->interrupt_mutex); dev->interrupt_count = 0; dev->net = net; strscpy(net->name, "usb%d", sizeof(net->name)); eth_hw_addr_set(net, node_id); /* rx and tx sides can use different message sizes; * bind() should set rx_urb_size in that case. */ dev->hard_mtu = net->mtu + net->hard_header_len; net->min_mtu = 0; net->max_mtu = ETH_MAX_MTU; net->netdev_ops = &usbnet_netdev_ops; net->watchdog_timeo = TX_TIMEOUT_JIFFIES; net->ethtool_ops = &usbnet_ethtool_ops; net->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; // allow device-specific bind/init procedures // NOTE net->name still not usable ... if (info->bind) { status = info->bind (dev, udev); if (status < 0) goto out1; // heuristic: "usb%d" for links we know are two-host, // else "eth%d" when there's reasonable doubt. userspace // can rename the link if it knows better. if ((dev->driver_info->flags & FLAG_ETHER) != 0 && ((dev->driver_info->flags & FLAG_POINTTOPOINT) == 0 || (net->dev_addr [0] & 0x02) == 0)) strscpy(net->name, "eth%d", sizeof(net->name)); /* WLAN devices should always be named "wlan%d" */ if ((dev->driver_info->flags & FLAG_WLAN) != 0) strscpy(net->name, "wlan%d", sizeof(net->name)); /* WWAN devices should always be named "wwan%d" */ if ((dev->driver_info->flags & FLAG_WWAN) != 0) strscpy(net->name, "wwan%d", sizeof(net->name)); /* devices that cannot do ARP */ if ((dev->driver_info->flags & FLAG_NOARP) != 0) net->flags |= IFF_NOARP; /* maybe the remote can't receive an Ethernet MTU */ if (net->mtu > (dev->hard_mtu - net->hard_header_len)) net->mtu = dev->hard_mtu - net->hard_header_len; } else if (!info->in || !info->out) status = usbnet_get_endpoints (dev, udev); else { u8 ep_addrs[3] = { info->in + USB_DIR_IN, info->out + USB_DIR_OUT, 0 }; dev->in = usb_rcvbulkpipe (xdev, info->in); dev->out = usb_sndbulkpipe (xdev, info->out); if (!(info->flags & FLAG_NO_SETINT)) status = usb_set_interface (xdev, interface->desc.bInterfaceNumber, interface->desc.bAlternateSetting); else status = 0; if (status == 0 && !usb_check_bulk_endpoints(udev, ep_addrs)) status = -EINVAL; } if (status >= 0 && dev->status) status = init_status (dev, udev); if (status < 0) goto out3; if (!dev->rx_urb_size) dev->rx_urb_size = dev->hard_mtu; dev->maxpacket = usb_maxpacket(dev->udev, dev->out); if (dev->maxpacket == 0) { /* that is a broken device */ status = -ENODEV; goto out4; } /* let userspace know we have a random address */ if (ether_addr_equal(net->dev_addr, node_id)) net->addr_assign_type = NET_ADDR_RANDOM; if ((dev->driver_info->flags & FLAG_WLAN) != 0) SET_NETDEV_DEVTYPE(net, &wlan_type); if ((dev->driver_info->flags & FLAG_WWAN) != 0) SET_NETDEV_DEVTYPE(net, &wwan_type); /* initialize max rx_qlen and tx_qlen */ usbnet_update_max_qlen(dev); if (dev->can_dma_sg && !(info->flags & FLAG_SEND_ZLP) && !(info->flags & FLAG_MULTI_PACKET)) { dev->padding_pkt = kzalloc(1, GFP_KERNEL); if (!dev->padding_pkt) { status = -ENOMEM; goto out4; } } status = register_netdev (net); if (status) goto out5; netif_info(dev, probe, dev->net, "register '%s' at usb-%s-%s, %s, %pM\n", udev->dev.driver->name, xdev->bus->bus_name, xdev->devpath, dev->driver_info->description, net->dev_addr); // ok, it's ready to go. usb_set_intfdata (udev, dev); netif_device_attach (net); if (dev->driver_info->flags & FLAG_LINK_INTR) usbnet_link_change(dev, 0, 0); return 0; out5: kfree(dev->padding_pkt); out4: usb_free_urb(dev->interrupt); out3: if (info->unbind) info->unbind (dev, udev); out1: /* subdrivers must undo all they did in bind() if they * fail it, but we may fail later and a deferred kevent * may trigger an error resubmitting itself and, worse, * schedule a timer. So we kill it all just in case. */ cancel_work_sync(&dev->kevent); del_timer_sync(&dev->delay); free_netdev(net); out: return status; } EXPORT_SYMBOL_GPL(usbnet_probe); /*-------------------------------------------------------------------------*/ /* * suspend the whole driver as soon as the first interface is suspended * resume only when the last interface is resumed */ int usbnet_suspend (struct usb_interface *intf, pm_message_t message) { struct usbnet *dev = usb_get_intfdata(intf); if (!dev->suspend_count++) { spin_lock_irq(&dev->txq.lock); /* don't autosuspend while transmitting */ if (dev->txq.qlen && PMSG_IS_AUTO(message)) { dev->suspend_count--; spin_unlock_irq(&dev->txq.lock); return -EBUSY; } else { set_bit(EVENT_DEV_ASLEEP, &dev->flags); spin_unlock_irq(&dev->txq.lock); } /* * accelerate emptying of the rx and queues, to avoid * having everything error out. */ netif_device_detach (dev->net); usbnet_terminate_urbs(dev); __usbnet_status_stop_force(dev); /* * reattach so runtime management can use and * wake the device */ netif_device_attach (dev->net); } return 0; } EXPORT_SYMBOL_GPL(usbnet_suspend); int usbnet_resume (struct usb_interface *intf) { struct usbnet *dev = usb_get_intfdata(intf); struct sk_buff *skb; struct urb *res; int retval; if (!--dev->suspend_count) { /* resume interrupt URB if it was previously submitted */ __usbnet_status_start_force(dev, GFP_NOIO); spin_lock_irq(&dev->txq.lock); while ((res = usb_get_from_anchor(&dev->deferred))) { skb = (struct sk_buff *)res->context; retval = usb_submit_urb(res, GFP_ATOMIC); if (retval < 0) { dev_kfree_skb_any(skb); kfree(res->sg); usb_free_urb(res); usb_autopm_put_interface_async(dev->intf); } else { netif_trans_update(dev->net); __skb_queue_tail(&dev->txq, skb); } } smp_mb(); clear_bit(EVENT_DEV_ASLEEP, &dev->flags); spin_unlock_irq(&dev->txq.lock); if (test_bit(EVENT_DEV_OPEN, &dev->flags)) { /* handle remote wakeup ASAP * we cannot race against stop */ if (netif_device_present(dev->net) && !timer_pending(&dev->delay) && !test_bit(EVENT_RX_HALT, &dev->flags)) rx_alloc_submit(dev, GFP_NOIO); if (!(dev->txq.qlen >= TX_QLEN(dev))) netif_tx_wake_all_queues(dev->net); tasklet_schedule (&dev->bh); } } if (test_and_clear_bit(EVENT_DEVICE_REPORT_IDLE, &dev->flags)) usb_autopm_get_interface_no_resume(intf); return 0; } EXPORT_SYMBOL_GPL(usbnet_resume); /* * Either a subdriver implements manage_power, then it is assumed to always * be ready to be suspended or it reports the readiness to be suspended * explicitly */ void usbnet_device_suggests_idle(struct usbnet *dev) { if (!test_and_set_bit(EVENT_DEVICE_REPORT_IDLE, &dev->flags)) { dev->intf->needs_remote_wakeup = 1; usb_autopm_put_interface_async(dev->intf); } } EXPORT_SYMBOL(usbnet_device_suggests_idle); /* * For devices that can do without special commands */ int usbnet_manage_power(struct usbnet *dev, int on) { dev->intf->needs_remote_wakeup = on; return 0; } EXPORT_SYMBOL(usbnet_manage_power); void usbnet_link_change(struct usbnet *dev, bool link, bool need_reset) { /* update link after link is reseted */ if (link && !need_reset) netif_carrier_on(dev->net); else netif_carrier_off(dev->net); if (need_reset && link) usbnet_defer_kevent(dev, EVENT_LINK_RESET); else usbnet_defer_kevent(dev, EVENT_LINK_CHANGE); } EXPORT_SYMBOL(usbnet_link_change); /*-------------------------------------------------------------------------*/ static int __usbnet_read_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, void *data, u16 size) { void *buf = NULL; int err = -ENOMEM; netdev_dbg(dev->net, "usbnet_read_cmd cmd=0x%02x reqtype=%02x" " value=0x%04x index=0x%04x size=%d\n", cmd, reqtype, value, index, size); if (size) { buf = kmalloc(size, GFP_NOIO); if (!buf) goto out; } err = usb_control_msg(dev->udev, usb_rcvctrlpipe(dev->udev, 0), cmd, reqtype, value, index, buf, size, USB_CTRL_GET_TIMEOUT); if (err > 0 && err <= size) { if (data) memcpy(data, buf, err); else netdev_dbg(dev->net, "Huh? Data requested but thrown away.\n"); } kfree(buf); out: return err; } static int __usbnet_write_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size) { void *buf = NULL; int err = -ENOMEM; netdev_dbg(dev->net, "usbnet_write_cmd cmd=0x%02x reqtype=%02x" " value=0x%04x index=0x%04x size=%d\n", cmd, reqtype, value, index, size); if (data) { buf = kmemdup(data, size, GFP_NOIO); if (!buf) goto out; } else { if (size) { WARN_ON_ONCE(1); err = -EINVAL; goto out; } } err = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), cmd, reqtype, value, index, buf, size, USB_CTRL_SET_TIMEOUT); kfree(buf); out: return err; } /* * The function can't be called inside suspend/resume callback, * otherwise deadlock will be caused. */ int usbnet_read_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, void *data, u16 size) { int ret; if (usb_autopm_get_interface(dev->intf) < 0) return -ENODEV; ret = __usbnet_read_cmd(dev, cmd, reqtype, value, index, data, size); usb_autopm_put_interface(dev->intf); return ret; } EXPORT_SYMBOL_GPL(usbnet_read_cmd); /* * The function can't be called inside suspend/resume callback, * otherwise deadlock will be caused. */ int usbnet_write_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size) { int ret; if (usb_autopm_get_interface(dev->intf) < 0) return -ENODEV; ret = __usbnet_write_cmd(dev, cmd, reqtype, value, index, data, size); usb_autopm_put_interface(dev->intf); return ret; } EXPORT_SYMBOL_GPL(usbnet_write_cmd); /* * The function can be called inside suspend/resume callback safely * and should only be called by suspend/resume callback generally. */ int usbnet_read_cmd_nopm(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, void *data, u16 size) { return __usbnet_read_cmd(dev, cmd, reqtype, value, index, data, size); } EXPORT_SYMBOL_GPL(usbnet_read_cmd_nopm); /* * The function can be called inside suspend/resume callback safely * and should only be called by suspend/resume callback generally. */ int usbnet_write_cmd_nopm(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size) { return __usbnet_write_cmd(dev, cmd, reqtype, value, index, data, size); } EXPORT_SYMBOL_GPL(usbnet_write_cmd_nopm); static void usbnet_async_cmd_cb(struct urb *urb) { struct usb_ctrlrequest *req = (struct usb_ctrlrequest *)urb->context; int status = urb->status; if (status < 0) dev_dbg(&urb->dev->dev, "%s failed with %d", __func__, status); kfree(req); usb_free_urb(urb); } /* * The caller must make sure that device can't be put into suspend * state until the control URB completes. */ int usbnet_write_cmd_async(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size) { struct usb_ctrlrequest *req; struct urb *urb; int err = -ENOMEM; void *buf = NULL; netdev_dbg(dev->net, "usbnet_write_cmd cmd=0x%02x reqtype=%02x" " value=0x%04x index=0x%04x size=%d\n", cmd, reqtype, value, index, size); urb = usb_alloc_urb(0, GFP_ATOMIC); if (!urb) goto fail; if (data) { buf = kmemdup(data, size, GFP_ATOMIC); if (!buf) { netdev_err(dev->net, "Error allocating buffer" " in %s!\n", __func__); goto fail_free_urb; } } req = kmalloc(sizeof(struct usb_ctrlrequest), GFP_ATOMIC); if (!req) goto fail_free_buf; req->bRequestType = reqtype; req->bRequest = cmd; req->wValue = cpu_to_le16(value); req->wIndex = cpu_to_le16(index); req->wLength = cpu_to_le16(size); usb_fill_control_urb(urb, dev->udev, usb_sndctrlpipe(dev->udev, 0), (void *)req, buf, size, usbnet_async_cmd_cb, req); urb->transfer_flags |= URB_FREE_BUFFER; err = usb_submit_urb(urb, GFP_ATOMIC); if (err < 0) { netdev_err(dev->net, "Error submitting the control" " message: status=%d\n", err); goto fail_free_all; } return 0; fail_free_all: kfree(req); fail_free_buf: kfree(buf); /* * avoid a double free * needed because the flag can be set only * after filling the URB */ urb->transfer_flags = 0; fail_free_urb: usb_free_urb(urb); fail: return err; } EXPORT_SYMBOL_GPL(usbnet_write_cmd_async); /*-------------------------------------------------------------------------*/ static int __init usbnet_init(void) { /* Compiler should optimize this out. */ BUILD_BUG_ON( sizeof_field(struct sk_buff, cb) < sizeof(struct skb_data)); eth_random_addr(node_id); return 0; } module_init(usbnet_init); static void __exit usbnet_exit(void) { } module_exit(usbnet_exit); MODULE_AUTHOR("David Brownell"); MODULE_DESCRIPTION("USB network driver framework"); MODULE_LICENSE("GPL"); |
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * This file implements the various access functions for the * PROC file system. It is mainly used for debugging and * statistics. * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de> * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de> * Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de> * * Fixes: * Alan Cox : UDP sockets show the rxqueue/txqueue * using hint flag for the netinfo. * Pauline Middelink : identd support * Alan Cox : Make /proc safer. * Erik Schoenfelder : /proc/net/snmp * Alan Cox : Handle dead sockets properly. * Gerhard Koerting : Show both timers * Alan Cox : Allow inode to be NULL (kernel socket) * Andi Kleen : Add support for open_requests and * split functions for more readibility. * Andi Kleen : Add support for /proc/net/netstat * Arnaldo C. Melo : Convert to seq_file */ #include <linux/types.h> #include <net/net_namespace.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/mptcp.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/bottom_half.h> #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/export.h> #include <net/sock.h> #include <net/raw.h> #define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX) /* * Report socket allocation statistics [mea@utu.fi] */ static int sockstat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; int orphans, sockets; orphans = tcp_orphan_count_sum(); sockets = proto_sockets_allocated_sum_positive(&tcp_prot); socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, refcount_read(&net->ipv4.tcp_death_row.tw_refcount) - 1, sockets, proto_memory_allocated(&tcp_prot)); seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), proto_memory_allocated(&udp_prot)); seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); seq_printf(seq, "FRAG: inuse %u memory %lu\n", atomic_read(&net->ipv4.fqdir->rhashtable.nelems), frag_mem_limit(net->ipv4.fqdir)); return 0; } /* snmp items */ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INPKTS), SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS), SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS), SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS), SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS), SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTREQUESTS), SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS), SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS), SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS), SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS), SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS), SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS), SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES), SNMP_MIB_ITEM("OutTransmits", IPSTATS_MIB_OUTPKTS), SNMP_MIB_SENTINEL }; /* Following items are displayed in /proc/net/netstat */ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), SNMP_MIB_ITEM("InMcastPkts", IPSTATS_MIB_INMCASTPKTS), SNMP_MIB_ITEM("OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), SNMP_MIB_ITEM("InBcastPkts", IPSTATS_MIB_INBCASTPKTS), SNMP_MIB_ITEM("OutBcastPkts", IPSTATS_MIB_OUTBCASTPKTS), SNMP_MIB_ITEM("InOctets", IPSTATS_MIB_INOCTETS), SNMP_MIB_ITEM("OutOctets", IPSTATS_MIB_OUTOCTETS), SNMP_MIB_ITEM("InMcastOctets", IPSTATS_MIB_INMCASTOCTETS), SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), /* Non RFC4293 fields */ SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS), SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS), SNMP_MIB_SENTINEL }; static const struct { const char *name; int index; } icmpmibmap[] = { { "DestUnreachs", ICMP_DEST_UNREACH }, { "TimeExcds", ICMP_TIME_EXCEEDED }, { "ParmProbs", ICMP_PARAMETERPROB }, { "SrcQuenchs", ICMP_SOURCE_QUENCH }, { "Redirects", ICMP_REDIRECT }, { "Echos", ICMP_ECHO }, { "EchoReps", ICMP_ECHOREPLY }, { "Timestamps", ICMP_TIMESTAMP }, { "TimestampReps", ICMP_TIMESTAMPREPLY }, { "AddrMasks", ICMP_ADDRESS }, { "AddrMaskReps", ICMP_ADDRESSREPLY }, { NULL, 0 } }; static const struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM), SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN), SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX), SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN), SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS), SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS), SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS), SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS), SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB), SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS), SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS), SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS), SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS), SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS), SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT), SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV), SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED), SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS), SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED), SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED), SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED), SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS), SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS), SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER), SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED), SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED), SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS), SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS), SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS), SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS), SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER), SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER), SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER), SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO), SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES), SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS), SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY), SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE), SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT), SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER), SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED), SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES), SNMP_MIB_ITEM("TCPMemoryPressuresChrono", LINUX_MIB_TCPMEMORYPRESSURESCHRONO), SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD), SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD), SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO), SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND), SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED), SNMP_MIB_ITEM("TCPMD5Failure", LINUX_MIB_TCPMD5FAILURE), SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED), SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP), SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES), SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL), SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE), SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE), SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP), SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE), SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE), SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), SNMP_MIB_ITEM("TCPFastOpenBlackhole", LINUX_MIB_TCPFASTOPENBLACKHOLE), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV), SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV), SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT), SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT), SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND), SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT), SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND), SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV), SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS), SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ), SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2), SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT), SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE), SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH), SNMP_MIB_ITEM("TCPAORequired", LINUX_MIB_TCPAOREQUIRED), SNMP_MIB_ITEM("TCPAOBad", LINUX_MIB_TCPAOBAD), SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND), SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD), SNMP_MIB_ITEM("TCPAODroppedIcmps", LINUX_MIB_TCPAODROPPEDICMPS), SNMP_MIB_SENTINEL }; static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals, unsigned short *type, int count) { int j; if (count) { seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %sType%u", type[j] & 0x100 ? "Out" : "In", type[j] & 0xff); seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %lu", vals[j]); } } static void icmpmsg_put(struct seq_file *seq) { #define PERLINE 16 int i, count; unsigned short type[PERLINE]; unsigned long vals[PERLINE], val; struct net *net = seq->private; count = 0; for (i = 0; i < ICMPMSG_MIB_MAX; i++) { val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]); if (val) { type[count] = i; vals[count++] = val; } if (count == PERLINE) { icmpmsg_put_line(seq, vals, type, count); count = 0; } } icmpmsg_put_line(seq, vals, type, count); #undef PERLINE } static void icmp_put(struct seq_file *seq) { int i; struct net *net = seq->private; atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); seq_puts(seq, " OutMsgs OutErrors OutRateLimitGlobal OutRateLimitHost"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); seq_printf(seq, " %lu %lu %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST)); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); } /* * Called from the PROCfs module. This outputs /proc/net/snmp. */ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) { struct net *net = seq->private; u64 buff64[IPSTATS_MIB_MAX]; int i; memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64)); seq_puts(seq, "Ip: Forwarding DefaultTTL"); for (i = 0; snmp4_ipstats_list[i].name; i++) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", IPV4_DEVCONF_ALL_RO(net, FORWARDING) ? 1 : 2, READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list, net->mib.ip_statistics, offsetof(struct ipstats_mib, syncp)); for (i = 0; snmp4_ipstats_list[i].name; i++) seq_printf(seq, " %llu", buff64[i]); return 0; } static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) { unsigned long buff[TCPUDP_MIB_MAX]; struct net *net = seq->private; int i; memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); seq_puts(seq, "\nTcp:"); for (i = 0; snmp4_tcp_list[i].name; i++) seq_printf(seq, " %s", snmp4_tcp_list[i].name); seq_puts(seq, "\nTcp:"); snmp_get_cpu_field_batch(buff, snmp4_tcp_list, net->mib.tcp_statistics); for (i = 0; snmp4_tcp_list[i].name; i++) { /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", buff[i]); else seq_printf(seq, " %lu", buff[i]); } memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); snmp_get_cpu_field_batch(buff, snmp4_udp_list, net->mib.udp_statistics); seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %lu", buff[i]); memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); /* the UDP and UDP-Lite MIBs are the same */ seq_puts(seq, "\nUdpLite:"); snmp_get_cpu_field_batch(buff, snmp4_udp_list, net->mib.udplite_statistics); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %lu", buff[i]); seq_putc(seq, '\n'); return 0; } static int snmp_seq_show(struct seq_file *seq, void *v) { snmp_seq_show_ipstats(seq, v); icmp_put(seq); /* RFC 2011 compatibility */ icmpmsg_put(seq); snmp_seq_show_tcp_udp(seq, v); return 0; } /* * Output /proc/net/netstat */ static int netstat_seq_show(struct seq_file *seq, void *v) { const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list) - 1; const int tcp_cnt = ARRAY_SIZE(snmp4_net_list) - 1; struct net *net = seq->private; unsigned long *buff; int i; seq_puts(seq, "TcpExt:"); for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %s", snmp4_net_list[i].name); seq_puts(seq, "\nTcpExt:"); buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)), GFP_KERNEL); if (buff) { snmp_get_cpu_field_batch(buff, snmp4_net_list, net->mib.net_statistics); for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %lu", buff[i]); } else { for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %lu", snmp_fold_field(net->mib.net_statistics, snmp4_net_list[i].entry)); } seq_puts(seq, "\nIpExt:"); for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %s", snmp4_ipextstats_list[i].name); seq_puts(seq, "\nIpExt:"); if (buff) { u64 *buff64 = (u64 *)buff; memset(buff64, 0, ip_cnt * sizeof(u64)); snmp_get_cpu_field64_batch(buff64, snmp4_ipextstats_list, net->mib.ip_statistics, offsetof(struct ipstats_mib, syncp)); for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %llu", buff64[i]); } else { for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %llu", snmp_fold_field64(net->mib.ip_statistics, snmp4_ipextstats_list[i].entry, offsetof(struct ipstats_mib, syncp))); } kfree(buff); seq_putc(seq, '\n'); mptcp_seq_show(seq); return 0; } static __net_init int ip_proc_init_net(struct net *net) { if (!proc_create_net_single("sockstat", 0444, net->proc_net, sockstat_seq_show, NULL)) goto out_sockstat; if (!proc_create_net_single("netstat", 0444, net->proc_net, netstat_seq_show, NULL)) goto out_netstat; if (!proc_create_net_single("snmp", 0444, net->proc_net, snmp_seq_show, NULL)) goto out_snmp; return 0; out_snmp: remove_proc_entry("netstat", net->proc_net); out_netstat: remove_proc_entry("sockstat", net->proc_net); out_sockstat: return -ENOMEM; } static __net_exit void ip_proc_exit_net(struct net *net) { remove_proc_entry("snmp", net->proc_net); remove_proc_entry("netstat", net->proc_net); remove_proc_entry("sockstat", net->proc_net); } static __net_initdata struct pernet_operations ip_proc_ops = { .init = ip_proc_init_net, .exit = ip_proc_exit_net, }; int __init ip_misc_proc_init(void) { return register_pernet_subsys(&ip_proc_ops); } |
2 2 2 2 8 3 5 5 1 1 3 1 9 1 1 4 4 4 3 1 1 1 941 939 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2013-2014 Intel Corp. */ #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/module.h> #include <linux/debugfs.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/pkt_sched.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include <net/6lowpan.h> /* for the compression support */ #define VERSION "0.1" static struct dentry *lowpan_enable_debugfs; static struct dentry *lowpan_control_debugfs; #define IFACE_NAME_TEMPLATE "bt%d" struct skb_cb { struct in6_addr addr; struct in6_addr gw; struct l2cap_chan *chan; }; #define lowpan_cb(skb) ((struct skb_cb *)((skb)->cb)) /* The devices list contains those devices that we are acting * as a proxy. The BT 6LoWPAN device is a virtual device that * connects to the Bluetooth LE device. The real connection to * BT device is done via l2cap layer. There exists one * virtual device / one BT 6LoWPAN network (=hciX device). * The list contains struct lowpan_dev elements. */ static LIST_HEAD(bt_6lowpan_devices); static DEFINE_SPINLOCK(devices_lock); static bool enable_6lowpan; /* We are listening incoming connections via this channel */ static struct l2cap_chan *listen_chan; static DEFINE_MUTEX(set_lock); struct lowpan_peer { struct list_head list; struct rcu_head rcu; struct l2cap_chan *chan; /* peer addresses in various formats */ unsigned char lladdr[ETH_ALEN]; struct in6_addr peer_addr; }; struct lowpan_btle_dev { struct list_head list; struct hci_dev *hdev; struct net_device *netdev; struct list_head peers; atomic_t peer_count; /* number of items in peers list */ struct work_struct delete_netdev; struct delayed_work notify_peers; }; static inline struct lowpan_btle_dev * lowpan_btle_dev(const struct net_device *netdev) { return (struct lowpan_btle_dev *)lowpan_dev(netdev)->priv; } static inline void peer_add(struct lowpan_btle_dev *dev, struct lowpan_peer *peer) { list_add_rcu(&peer->list, &dev->peers); atomic_inc(&dev->peer_count); } static inline bool peer_del(struct lowpan_btle_dev *dev, struct lowpan_peer *peer) { list_del_rcu(&peer->list); kfree_rcu(peer, rcu); module_put(THIS_MODULE); if (atomic_dec_and_test(&dev->peer_count)) { BT_DBG("last peer"); return true; } return false; } static inline struct lowpan_peer * __peer_lookup_chan(struct lowpan_btle_dev *dev, struct l2cap_chan *chan) { struct lowpan_peer *peer; list_for_each_entry_rcu(peer, &dev->peers, list) { if (peer->chan == chan) return peer; } return NULL; } static inline struct lowpan_peer * __peer_lookup_conn(struct lowpan_btle_dev *dev, struct l2cap_conn *conn) { struct lowpan_peer *peer; list_for_each_entry_rcu(peer, &dev->peers, list) { if (peer->chan->conn == conn) return peer; } return NULL; } static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, struct in6_addr *daddr, struct sk_buff *skb) { struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); int count = atomic_read(&dev->peer_count); const struct in6_addr *nexthop; struct lowpan_peer *peer; struct neighbour *neigh; BT_DBG("peers %d addr %pI6c rt %p", count, daddr, rt); if (!rt) { if (ipv6_addr_any(&lowpan_cb(skb)->gw)) { /* There is neither route nor gateway, * probably the destination is a direct peer. */ nexthop = daddr; } else { /* There is a known gateway */ nexthop = &lowpan_cb(skb)->gw; } } else { nexthop = rt6_nexthop(rt, daddr); /* We need to remember the address because it is needed * by bt_xmit() when sending the packet. In bt_xmit(), the * destination routing info is not set. */ memcpy(&lowpan_cb(skb)->gw, nexthop, sizeof(struct in6_addr)); } BT_DBG("gw %pI6c", nexthop); rcu_read_lock(); list_for_each_entry_rcu(peer, &dev->peers, list) { BT_DBG("dst addr %pMR dst type %u ip %pI6c", &peer->chan->dst, peer->chan->dst_type, &peer->peer_addr); if (!ipv6_addr_cmp(&peer->peer_addr, nexthop)) { rcu_read_unlock(); return peer; } } /* use the neighbour cache for matching addresses assigned by SLAAC */ neigh = __ipv6_neigh_lookup(dev->netdev, nexthop); if (neigh) { list_for_each_entry_rcu(peer, &dev->peers, list) { if (!memcmp(neigh->ha, peer->lladdr, ETH_ALEN)) { neigh_release(neigh); rcu_read_unlock(); return peer; } } neigh_release(neigh); } rcu_read_unlock(); return NULL; } static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn) { struct lowpan_btle_dev *entry; struct lowpan_peer *peer = NULL; rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { peer = __peer_lookup_conn(entry, conn); if (peer) break; } rcu_read_unlock(); return peer; } static struct lowpan_btle_dev *lookup_dev(struct l2cap_conn *conn) { struct lowpan_btle_dev *entry; struct lowpan_btle_dev *dev = NULL; rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { if (conn->hcon->hdev == entry->hdev) { dev = entry; break; } } rcu_read_unlock(); return dev; } static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *skb_cp; skb_cp = skb_copy(skb, GFP_ATOMIC); if (!skb_cp) return NET_RX_DROP; return netif_rx(skb_cp); } static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev, struct lowpan_peer *peer) { const u8 *saddr; saddr = peer->lladdr; return lowpan_header_decompress(skb, netdev, netdev->dev_addr, saddr); } static int recv_pkt(struct sk_buff *skb, struct net_device *dev, struct lowpan_peer *peer) { struct sk_buff *local_skb; int ret; if (!netif_running(dev)) goto drop; if (dev->type != ARPHRD_6LOWPAN || !skb->len) goto drop; skb_reset_network_header(skb); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto drop; /* check that it's our buffer */ if (lowpan_is_ipv6(*skb_network_header(skb))) { /* Pull off the 1-byte of 6lowpan header. */ skb_pull(skb, 1); /* Copy the packet so that the IPv6 header is * properly aligned. */ local_skb = skb_copy_expand(skb, NET_SKB_PAD - 1, skb_tailroom(skb), GFP_ATOMIC); if (!local_skb) goto drop; local_skb->protocol = htons(ETH_P_IPV6); local_skb->pkt_type = PACKET_HOST; local_skb->dev = dev; skb_set_transport_header(local_skb, sizeof(struct ipv6hdr)); if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { kfree_skb(local_skb); goto drop; } dev->stats.rx_bytes += skb->len; dev->stats.rx_packets++; consume_skb(local_skb); consume_skb(skb); } else if (lowpan_is_iphc(*skb_network_header(skb))) { local_skb = skb_clone(skb, GFP_ATOMIC); if (!local_skb) goto drop; local_skb->dev = dev; ret = iphc_decompress(local_skb, dev, peer); if (ret < 0) { BT_DBG("iphc_decompress failed: %d", ret); kfree_skb(local_skb); goto drop; } local_skb->protocol = htons(ETH_P_IPV6); local_skb->pkt_type = PACKET_HOST; if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { kfree_skb(local_skb); goto drop; } dev->stats.rx_bytes += skb->len; dev->stats.rx_packets++; consume_skb(local_skb); consume_skb(skb); } else { BT_DBG("unknown packet type"); goto drop; } return NET_RX_SUCCESS; drop: dev->stats.rx_dropped++; return NET_RX_DROP; } /* Packet from BT LE device */ static int chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { struct lowpan_btle_dev *dev; struct lowpan_peer *peer; int err; peer = lookup_peer(chan->conn); if (!peer) return -ENOENT; dev = lookup_dev(chan->conn); if (!dev || !dev->netdev) return -ENOENT; err = recv_pkt(skb, dev->netdev, peer); if (err) { BT_DBG("recv pkt %d", err); err = -EAGAIN; } return err; } static int setup_header(struct sk_buff *skb, struct net_device *netdev, bdaddr_t *peer_addr, u8 *peer_addr_type) { struct in6_addr ipv6_daddr; struct ipv6hdr *hdr; struct lowpan_btle_dev *dev; struct lowpan_peer *peer; u8 *daddr; int err, status = 0; hdr = ipv6_hdr(skb); dev = lowpan_btle_dev(netdev); memcpy(&ipv6_daddr, &hdr->daddr, sizeof(ipv6_daddr)); if (ipv6_addr_is_multicast(&ipv6_daddr)) { lowpan_cb(skb)->chan = NULL; daddr = NULL; } else { BT_DBG("dest IP %pI6c", &ipv6_daddr); /* The packet might be sent to 6lowpan interface * because of routing (either via default route * or user set route) so get peer according to * the destination address. */ peer = peer_lookup_dst(dev, &ipv6_daddr, skb); if (!peer) { BT_DBG("no such peer"); return -ENOENT; } daddr = peer->lladdr; *peer_addr = peer->chan->dst; *peer_addr_type = peer->chan->dst_type; lowpan_cb(skb)->chan = peer->chan; status = 1; } lowpan_header_compress(skb, netdev, daddr, dev->netdev->dev_addr); err = dev_hard_header(skb, netdev, ETH_P_IPV6, NULL, NULL, 0); if (err < 0) return err; return status; } static int header_create(struct sk_buff *skb, struct net_device *netdev, unsigned short type, const void *_daddr, const void *_saddr, unsigned int len) { if (type != ETH_P_IPV6) return -EINVAL; return 0; } /* Packet to BT LE device */ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, struct net_device *netdev) { struct msghdr msg; struct kvec iv; int err; /* Remember the skb so that we can send EAGAIN to the caller if * we run out of credits. */ chan->data = skb; iv.iov_base = skb->data; iv.iov_len = skb->len; memset(&msg, 0, sizeof(msg)); iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iv, 1, skb->len); err = l2cap_chan_send(chan, &msg, skb->len); if (err > 0) { netdev->stats.tx_bytes += err; netdev->stats.tx_packets++; return 0; } if (err < 0) netdev->stats.tx_errors++; return err; } static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) { struct sk_buff *local_skb; struct lowpan_btle_dev *entry; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { struct lowpan_peer *pentry; struct lowpan_btle_dev *dev; if (entry->netdev != netdev) continue; dev = lowpan_btle_dev(entry->netdev); list_for_each_entry_rcu(pentry, &dev->peers, list) { int ret; local_skb = skb_clone(skb, GFP_ATOMIC); BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p", netdev->name, &pentry->chan->dst, pentry->chan->dst_type, &pentry->peer_addr, pentry->chan); ret = send_pkt(pentry->chan, local_skb, netdev); if (ret < 0) err = ret; kfree_skb(local_skb); } } rcu_read_unlock(); return err; } static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) { int err = 0; bdaddr_t addr; u8 addr_type; /* We must take a copy of the skb before we modify/replace the ipv6 * header as the header could be used elsewhere */ skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) return NET_XMIT_DROP; /* Return values from setup_header() * <0 - error, packet is dropped * 0 - this is a multicast packet * 1 - this is unicast packet */ err = setup_header(skb, netdev, &addr, &addr_type); if (err < 0) { kfree_skb(skb); return NET_XMIT_DROP; } if (err) { if (lowpan_cb(skb)->chan) { BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p", netdev->name, &addr, addr_type, &lowpan_cb(skb)->addr, lowpan_cb(skb)->chan); err = send_pkt(lowpan_cb(skb)->chan, skb, netdev); } else { err = -ENOENT; } } else { /* We need to send the packet to every device behind this * interface. */ err = send_mcast_pkt(skb, netdev); } dev_kfree_skb(skb); if (err) BT_DBG("ERROR: xmit failed (%d)", err); return err < 0 ? NET_XMIT_DROP : err; } static int bt_dev_init(struct net_device *dev) { netdev_lockdep_set_classes(dev); return 0; } static const struct net_device_ops netdev_ops = { .ndo_init = bt_dev_init, .ndo_start_xmit = bt_xmit, }; static const struct header_ops header_ops = { .create = header_create, }; static void netdev_setup(struct net_device *dev) { dev->hard_header_len = 0; dev->needed_tailroom = 0; dev->flags = IFF_RUNNING | IFF_MULTICAST; dev->watchdog_timeo = 0; dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; dev->netdev_ops = &netdev_ops; dev->header_ops = &header_ops; dev->needs_free_netdev = true; } static const struct device_type bt_type = { .name = "bluetooth", }; static void ifup(struct net_device *netdev) { int err; rtnl_lock(); err = dev_open(netdev, NULL); if (err < 0) BT_INFO("iface %s cannot be opened (%d)", netdev->name, err); rtnl_unlock(); } static void ifdown(struct net_device *netdev) { rtnl_lock(); dev_close(netdev); rtnl_unlock(); } static void do_notify_peers(struct work_struct *work) { struct lowpan_btle_dev *dev = container_of(work, struct lowpan_btle_dev, notify_peers.work); netdev_notify_peers(dev->netdev); /* send neighbour adv at startup */ } static bool is_bt_6lowpan(struct hci_conn *hcon) { if (hcon->type != LE_LINK) return false; if (!enable_6lowpan) return false; return true; } static struct l2cap_chan *chan_create(void) { struct l2cap_chan *chan; chan = l2cap_chan_create(); if (!chan) return NULL; l2cap_chan_set_defaults(chan); chan->chan_type = L2CAP_CHAN_CONN_ORIENTED; chan->mode = L2CAP_MODE_LE_FLOWCTL; chan->imtu = 1280; return chan; } static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, struct lowpan_btle_dev *dev, bool new_netdev) { struct lowpan_peer *peer; peer = kzalloc(sizeof(*peer), GFP_ATOMIC); if (!peer) return NULL; peer->chan = chan; baswap((void *)peer->lladdr, &chan->dst); lowpan_iphc_uncompress_eui48_lladdr(&peer->peer_addr, peer->lladdr); spin_lock(&devices_lock); INIT_LIST_HEAD(&peer->list); peer_add(dev, peer); spin_unlock(&devices_lock); /* Notifying peers about us needs to be done without locks held */ if (new_netdev) INIT_DELAYED_WORK(&dev->notify_peers, do_notify_peers); schedule_delayed_work(&dev->notify_peers, msecs_to_jiffies(100)); return peer->chan; } static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev) { struct net_device *netdev; bdaddr_t addr; int err; netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_btle_dev)), IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN, netdev_setup); if (!netdev) return -ENOMEM; netdev->addr_assign_type = NET_ADDR_PERM; baswap(&addr, &chan->src); __dev_addr_set(netdev, &addr, sizeof(addr)); netdev->netdev_ops = &netdev_ops; SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev); SET_NETDEV_DEVTYPE(netdev, &bt_type); *dev = lowpan_btle_dev(netdev); (*dev)->netdev = netdev; (*dev)->hdev = chan->conn->hcon->hdev; INIT_LIST_HEAD(&(*dev)->peers); spin_lock(&devices_lock); INIT_LIST_HEAD(&(*dev)->list); list_add_rcu(&(*dev)->list, &bt_6lowpan_devices); spin_unlock(&devices_lock); err = lowpan_register_netdev(netdev, LOWPAN_LLTYPE_BTLE); if (err < 0) { BT_INFO("register_netdev failed %d", err); spin_lock(&devices_lock); list_del_rcu(&(*dev)->list); spin_unlock(&devices_lock); free_netdev(netdev); goto out; } BT_DBG("ifindex %d peer bdaddr %pMR type %d my addr %pMR type %d", netdev->ifindex, &chan->dst, chan->dst_type, &chan->src, chan->src_type); set_bit(__LINK_STATE_PRESENT, &netdev->state); return 0; out: return err; } static inline void chan_ready_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *dev; bool new_netdev = false; dev = lookup_dev(chan->conn); BT_DBG("chan %p conn %p dev %p", chan, chan->conn, dev); if (!dev) { if (setup_netdev(chan, &dev) < 0) { l2cap_chan_del(chan, -ENOENT); return; } new_netdev = true; } if (!try_module_get(THIS_MODULE)) return; add_peer_chan(chan, dev, new_netdev); ifup(dev->netdev); } static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan) { struct l2cap_chan *chan; chan = chan_create(); if (!chan) return NULL; chan->ops = pchan->ops; BT_DBG("chan %p pchan %p", chan, pchan); return chan; } static void delete_netdev(struct work_struct *work) { struct lowpan_btle_dev *entry = container_of(work, struct lowpan_btle_dev, delete_netdev); lowpan_unregister_netdev(entry->netdev); /* The entry pointer is deleted by the netdev destructor. */ } static void chan_close_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *entry; struct lowpan_btle_dev *dev = NULL; struct lowpan_peer *peer; int err = -ENOENT; bool last = false, remove = true; BT_DBG("chan %p conn %p", chan, chan->conn); if (chan->conn && chan->conn->hcon) { if (!is_bt_6lowpan(chan->conn->hcon)) return; /* If conn is set, then the netdev is also there and we should * not remove it. */ remove = false; } spin_lock(&devices_lock); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { dev = lowpan_btle_dev(entry->netdev); peer = __peer_lookup_chan(dev, chan); if (peer) { last = peer_del(dev, peer); err = 0; BT_DBG("dev %p removing %speer %p", dev, last ? "last " : "1 ", peer); BT_DBG("chan %p orig refcnt %u", chan, kref_read(&chan->kref)); l2cap_chan_put(chan); break; } } if (!err && last && dev && !atomic_read(&dev->peer_count)) { spin_unlock(&devices_lock); cancel_delayed_work_sync(&dev->notify_peers); ifdown(dev->netdev); if (remove) { INIT_WORK(&entry->delete_netdev, delete_netdev); schedule_work(&entry->delete_netdev); } } else { spin_unlock(&devices_lock); } } static void chan_state_change_cb(struct l2cap_chan *chan, int state, int err) { BT_DBG("chan %p conn %p state %s err %d", chan, chan->conn, state_to_string(state), err); } static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb) { /* Note that we must allocate using GFP_ATOMIC here as * this function is called originally from netdev hard xmit * function in atomic context. */ return bt_skb_alloc(hdr_len + len, GFP_ATOMIC); } static void chan_suspend_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *dev; BT_DBG("chan %p suspend", chan); dev = lookup_dev(chan->conn); if (!dev || !dev->netdev) return; netif_stop_queue(dev->netdev); } static void chan_resume_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *dev; BT_DBG("chan %p resume", chan); dev = lookup_dev(chan->conn); if (!dev || !dev->netdev) return; netif_wake_queue(dev->netdev); } static long chan_get_sndtimeo_cb(struct l2cap_chan *chan) { return L2CAP_CONN_TIMEOUT; } static const struct l2cap_ops bt_6lowpan_chan_ops = { .name = "L2CAP 6LoWPAN channel", .new_connection = chan_new_conn_cb, .recv = chan_recv_cb, .close = chan_close_cb, .state_change = chan_state_change_cb, .ready = chan_ready_cb, .resume = chan_resume_cb, .suspend = chan_suspend_cb, .get_sndtimeo = chan_get_sndtimeo_cb, .alloc_skb = chan_alloc_skb_cb, .teardown = l2cap_chan_no_teardown, .defer = l2cap_chan_no_defer, .set_shutdown = l2cap_chan_no_set_shutdown, }; static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type) { struct l2cap_chan *chan; int err; chan = chan_create(); if (!chan) return -EINVAL; chan->ops = &bt_6lowpan_chan_ops; err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0, addr, dst_type, L2CAP_CONN_TIMEOUT); BT_DBG("chan %p err %d", chan, err); if (err < 0) l2cap_chan_put(chan); return err; } static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type) { struct lowpan_peer *peer; BT_DBG("conn %p dst type %u", conn, dst_type); peer = lookup_peer(conn); if (!peer) return -ENOENT; BT_DBG("peer %p chan %p", peer, peer->chan); l2cap_chan_close(peer->chan, ENOENT); return 0; } static struct l2cap_chan *bt_6lowpan_listen(void) { bdaddr_t *addr = BDADDR_ANY; struct l2cap_chan *chan; int err; if (!enable_6lowpan) return NULL; chan = chan_create(); if (!chan) return NULL; chan->ops = &bt_6lowpan_chan_ops; chan->state = BT_LISTEN; chan->src_type = BDADDR_LE_PUBLIC; atomic_set(&chan->nesting, L2CAP_NESTING_PARENT); BT_DBG("chan %p src type %u", chan, chan->src_type); err = l2cap_add_psm(chan, addr, cpu_to_le16(L2CAP_PSM_IPSP)); if (err) { l2cap_chan_put(chan); BT_ERR("psm cannot be added err %d", err); return NULL; } return chan; } static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, struct l2cap_conn **conn) { struct hci_conn *hcon; struct hci_dev *hdev; int n; n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu", &addr->b[5], &addr->b[4], &addr->b[3], &addr->b[2], &addr->b[1], &addr->b[0], addr_type); if (n < 7) return -EINVAL; /* The LE_PUBLIC address type is ignored because of BDADDR_ANY */ hdev = hci_get_route(addr, BDADDR_ANY, BDADDR_LE_PUBLIC); if (!hdev) return -ENOENT; hci_dev_lock(hdev); hcon = hci_conn_hash_lookup_le(hdev, addr, *addr_type); hci_dev_unlock(hdev); hci_dev_put(hdev); if (!hcon) return -ENOENT; *conn = (struct l2cap_conn *)hcon->l2cap_data; BT_DBG("conn %p dst %pMR type %u", *conn, &hcon->dst, hcon->dst_type); return 0; } static void disconnect_all_peers(void) { struct lowpan_btle_dev *entry; struct lowpan_peer *peer, *tmp_peer, *new_peer; struct list_head peers; INIT_LIST_HEAD(&peers); /* We make a separate list of peers as the close_cb() will * modify the device peers list so it is better not to mess * with the same list at the same time. */ rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { list_for_each_entry_rcu(peer, &entry->peers, list) { new_peer = kmalloc(sizeof(*new_peer), GFP_ATOMIC); if (!new_peer) break; new_peer->chan = peer->chan; INIT_LIST_HEAD(&new_peer->list); list_add(&new_peer->list, &peers); } } rcu_read_unlock(); spin_lock(&devices_lock); list_for_each_entry_safe(peer, tmp_peer, &peers, list) { l2cap_chan_close(peer->chan, ENOENT); list_del_rcu(&peer->list); kfree_rcu(peer, rcu); } spin_unlock(&devices_lock); } struct set_enable { struct work_struct work; bool flag; }; static void do_enable_set(struct work_struct *work) { struct set_enable *set_enable = container_of(work, struct set_enable, work); if (!set_enable->flag || enable_6lowpan != set_enable->flag) /* Disconnect existing connections if 6lowpan is * disabled */ disconnect_all_peers(); enable_6lowpan = set_enable->flag; mutex_lock(&set_lock); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); } listen_chan = bt_6lowpan_listen(); mutex_unlock(&set_lock); kfree(set_enable); } static int lowpan_enable_set(void *data, u64 val) { struct set_enable *set_enable; set_enable = kzalloc(sizeof(*set_enable), GFP_KERNEL); if (!set_enable) return -ENOMEM; set_enable->flag = !!val; INIT_WORK(&set_enable->work, do_enable_set); schedule_work(&set_enable->work); return 0; } static int lowpan_enable_get(void *data, u64 *val) { *val = enable_6lowpan; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(lowpan_enable_fops, lowpan_enable_get, lowpan_enable_set, "%llu\n"); static ssize_t lowpan_control_write(struct file *fp, const char __user *user_buffer, size_t count, loff_t *position) { char buf[32]; size_t buf_size = min(count, sizeof(buf) - 1); int ret; bdaddr_t addr; u8 addr_type; struct l2cap_conn *conn = NULL; if (copy_from_user(buf, user_buffer, buf_size)) return -EFAULT; buf[buf_size] = '\0'; if (memcmp(buf, "connect ", 8) == 0) { ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn); if (ret == -EINVAL) return ret; mutex_lock(&set_lock); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); listen_chan = NULL; } mutex_unlock(&set_lock); if (conn) { struct lowpan_peer *peer; if (!is_bt_6lowpan(conn->hcon)) return -EINVAL; peer = lookup_peer(conn); if (peer) { BT_DBG("6LoWPAN connection already exists"); return -EALREADY; } BT_DBG("conn %p dst %pMR type %d user %u", conn, &conn->hcon->dst, conn->hcon->dst_type, addr_type); } ret = bt_6lowpan_connect(&addr, addr_type); if (ret < 0) return ret; return count; } if (memcmp(buf, "disconnect ", 11) == 0) { ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn); if (ret < 0) return ret; ret = bt_6lowpan_disconnect(conn, addr_type); if (ret < 0) return ret; return count; } return count; } static int lowpan_control_show(struct seq_file *f, void *ptr) { struct lowpan_btle_dev *entry; struct lowpan_peer *peer; spin_lock(&devices_lock); list_for_each_entry(entry, &bt_6lowpan_devices, list) { list_for_each_entry(peer, &entry->peers, list) seq_printf(f, "%pMR (type %u)\n", &peer->chan->dst, peer->chan->dst_type); } spin_unlock(&devices_lock); return 0; } static int lowpan_control_open(struct inode *inode, struct file *file) { return single_open(file, lowpan_control_show, inode->i_private); } static const struct file_operations lowpan_control_fops = { .open = lowpan_control_open, .read = seq_read, .write = lowpan_control_write, .llseek = seq_lseek, .release = single_release, }; static void disconnect_devices(void) { struct lowpan_btle_dev *entry, *tmp, *new_dev; struct list_head devices; INIT_LIST_HEAD(&devices); /* We make a separate list of devices because the unregister_netdev() * will call device_event() which will also want to modify the same * devices list. */ rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { new_dev = kmalloc(sizeof(*new_dev), GFP_ATOMIC); if (!new_dev) break; new_dev->netdev = entry->netdev; INIT_LIST_HEAD(&new_dev->list); list_add_rcu(&new_dev->list, &devices); } rcu_read_unlock(); list_for_each_entry_safe(entry, tmp, &devices, list) { ifdown(entry->netdev); BT_DBG("Unregistering netdev %s %p", entry->netdev->name, entry->netdev); lowpan_unregister_netdev(entry->netdev); kfree(entry); } } static int device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct lowpan_btle_dev *entry; if (netdev->type != ARPHRD_6LOWPAN) return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: spin_lock(&devices_lock); list_for_each_entry(entry, &bt_6lowpan_devices, list) { if (entry->netdev == netdev) { BT_DBG("Unregistered netdev %s %p", netdev->name, netdev); list_del(&entry->list); break; } } spin_unlock(&devices_lock); break; } return NOTIFY_DONE; } static struct notifier_block bt_6lowpan_dev_notifier = { .notifier_call = device_event, }; static int __init bt_6lowpan_init(void) { lowpan_enable_debugfs = debugfs_create_file_unsafe("6lowpan_enable", 0644, bt_debugfs, NULL, &lowpan_enable_fops); lowpan_control_debugfs = debugfs_create_file("6lowpan_control", 0644, bt_debugfs, NULL, &lowpan_control_fops); return register_netdevice_notifier(&bt_6lowpan_dev_notifier); } static void __exit bt_6lowpan_exit(void) { debugfs_remove(lowpan_enable_debugfs); debugfs_remove(lowpan_control_debugfs); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); } disconnect_devices(); unregister_netdevice_notifier(&bt_6lowpan_dev_notifier); } module_init(bt_6lowpan_init); module_exit(bt_6lowpan_exit); MODULE_AUTHOR("Jukka Rissanen <jukka.rissanen@linux.intel.com>"); MODULE_DESCRIPTION("Bluetooth 6LoWPAN"); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); |
9 4 4 5 5 14 3 11 4 3 5 2 8 6 6 5 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 | // SPDX-License-Identifier: GPL-2.0-or-later /* Public-key operation keyctls * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/slab.h> #include <linux/err.h> #include <linux/key.h> #include <linux/keyctl.h> #include <linux/parser.h> #include <linux/uaccess.h> #include <keys/user-type.h> #include "internal.h" static void keyctl_pkey_params_free(struct kernel_pkey_params *params) { kfree(params->info); key_put(params->key); } enum { Opt_err, Opt_enc, /* "enc=<encoding>" eg. "enc=oaep" */ Opt_hash, /* "hash=<digest-name>" eg. "hash=sha1" */ }; static const match_table_t param_keys = { { Opt_enc, "enc=%s" }, { Opt_hash, "hash=%s" }, { Opt_err, NULL } }; /* * Parse the information string which consists of key=val pairs. */ static int keyctl_pkey_params_parse(struct kernel_pkey_params *params) { unsigned long token_mask = 0; substring_t args[MAX_OPT_ARGS]; char *c = params->info, *p, *q; int token; while ((p = strsep(&c, " \t"))) { if (*p == '\0' || *p == ' ' || *p == '\t') continue; token = match_token(p, param_keys, args); if (token == Opt_err) return -EINVAL; if (__test_and_set_bit(token, &token_mask)) return -EINVAL; q = args[0].from; if (!q[0]) return -EINVAL; switch (token) { case Opt_enc: params->encoding = q; break; case Opt_hash: params->hash_algo = q; break; default: return -EINVAL; } } return 0; } /* * Interpret parameters. Callers must always call the free function * on params, even if an error is returned. */ static int keyctl_pkey_params_get(key_serial_t id, const char __user *_info, struct kernel_pkey_params *params) { key_ref_t key_ref; void *p; int ret; memset(params, 0, sizeof(*params)); params->encoding = "raw"; p = strndup_user(_info, PAGE_SIZE); if (IS_ERR(p)) return PTR_ERR(p); params->info = p; ret = keyctl_pkey_params_parse(params); if (ret < 0) return ret; key_ref = lookup_user_key(id, 0, KEY_NEED_SEARCH); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); params->key = key_ref_to_ptr(key_ref); if (!params->key->type->asym_query) return -EOPNOTSUPP; return 0; } /* * Get parameters from userspace. Callers must always call the free function * on params, even if an error is returned. */ static int keyctl_pkey_params_get_2(const struct keyctl_pkey_params __user *_params, const char __user *_info, int op, struct kernel_pkey_params *params) { struct keyctl_pkey_params uparams; struct kernel_pkey_query info; int ret; memset(params, 0, sizeof(*params)); params->encoding = "raw"; if (copy_from_user(&uparams, _params, sizeof(uparams)) != 0) return -EFAULT; ret = keyctl_pkey_params_get(uparams.key_id, _info, params); if (ret < 0) return ret; ret = params->key->type->asym_query(params, &info); if (ret < 0) return ret; switch (op) { case KEYCTL_PKEY_ENCRYPT: if (uparams.in_len > info.max_dec_size || uparams.out_len > info.max_enc_size) return -EINVAL; break; case KEYCTL_PKEY_DECRYPT: if (uparams.in_len > info.max_enc_size || uparams.out_len > info.max_dec_size) return -EINVAL; break; case KEYCTL_PKEY_SIGN: if (uparams.in_len > info.max_data_size || uparams.out_len > info.max_sig_size) return -EINVAL; break; case KEYCTL_PKEY_VERIFY: if (uparams.in_len > info.max_data_size || uparams.in2_len > info.max_sig_size) return -EINVAL; break; default: BUG(); } params->in_len = uparams.in_len; params->out_len = uparams.out_len; /* Note: same as in2_len */ return 0; } /* * Query information about an asymmetric key. */ long keyctl_pkey_query(key_serial_t id, const char __user *_info, struct keyctl_pkey_query __user *_res) { struct kernel_pkey_params params; struct kernel_pkey_query res; long ret; ret = keyctl_pkey_params_get(id, _info, ¶ms); if (ret < 0) goto error; ret = params.key->type->asym_query(¶ms, &res); if (ret < 0) goto error; ret = -EFAULT; if (copy_to_user(_res, &res, sizeof(res)) == 0 && clear_user(_res->__spare, sizeof(_res->__spare)) == 0) ret = 0; error: keyctl_pkey_params_free(¶ms); return ret; } /* * Encrypt/decrypt/sign * * Encrypt data, decrypt data or sign data using a public key. * * _info is a string of supplementary information in key=val format. For * instance, it might contain: * * "enc=pkcs1 hash=sha256" * * where enc= specifies the encoding and hash= selects the OID to go in that * particular encoding if required. If enc= isn't supplied, it's assumed that * the caller is supplying raw values. * * If successful, the amount of data written into the output buffer is * returned. */ long keyctl_pkey_e_d_s(int op, const struct keyctl_pkey_params __user *_params, const char __user *_info, const void __user *_in, void __user *_out) { struct kernel_pkey_params params; void *in, *out; long ret; ret = keyctl_pkey_params_get_2(_params, _info, op, ¶ms); if (ret < 0) goto error_params; ret = -EOPNOTSUPP; if (!params.key->type->asym_eds_op) goto error_params; switch (op) { case KEYCTL_PKEY_ENCRYPT: params.op = kernel_pkey_encrypt; break; case KEYCTL_PKEY_DECRYPT: params.op = kernel_pkey_decrypt; break; case KEYCTL_PKEY_SIGN: params.op = kernel_pkey_sign; break; default: BUG(); } in = memdup_user(_in, params.in_len); if (IS_ERR(in)) { ret = PTR_ERR(in); goto error_params; } ret = -ENOMEM; out = kmalloc(params.out_len, GFP_KERNEL); if (!out) goto error_in; ret = params.key->type->asym_eds_op(¶ms, in, out); if (ret < 0) goto error_out; if (copy_to_user(_out, out, ret) != 0) ret = -EFAULT; error_out: kfree(out); error_in: kfree(in); error_params: keyctl_pkey_params_free(¶ms); return ret; } /* * Verify a signature. * * Verify a public key signature using the given key, or if not given, search * for a matching key. * * _info is a string of supplementary information in key=val format. For * instance, it might contain: * * "enc=pkcs1 hash=sha256" * * where enc= specifies the signature blob encoding and hash= selects the OID * to go in that particular encoding. If enc= isn't supplied, it's assumed * that the caller is supplying raw values. * * If successful, 0 is returned. */ long keyctl_pkey_verify(const struct keyctl_pkey_params __user *_params, const char __user *_info, const void __user *_in, const void __user *_in2) { struct kernel_pkey_params params; void *in, *in2; long ret; ret = keyctl_pkey_params_get_2(_params, _info, KEYCTL_PKEY_VERIFY, ¶ms); if (ret < 0) goto error_params; ret = -EOPNOTSUPP; if (!params.key->type->asym_verify_signature) goto error_params; in = memdup_user(_in, params.in_len); if (IS_ERR(in)) { ret = PTR_ERR(in); goto error_params; } in2 = memdup_user(_in2, params.in2_len); if (IS_ERR(in2)) { ret = PTR_ERR(in2); goto error_in; } params.op = kernel_pkey_verify; ret = params.key->type->asym_verify_signature(¶ms, in, in2); kfree(in2); error_in: kfree(in); error_params: keyctl_pkey_params_free(¶ms); return ret; } |
19 19 1 1 17 15 12 1 1 1 1 2 1 1 2 2 4 1 2 1 2 1 1 2 1 1 1 2 3 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 | // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * The HSR spec says never to forward the same frame twice on the same * interface. A frame is identified by its source MAC address and its HSR * sequence number. This code keeps track of senders and their sequence numbers * to allow filtering of duplicate frames, and to detect HSR ring errors. * Same code handles filtering of duplicates for PRP as well. */ #include <linux/if_ether.h> #include <linux/etherdevice.h> #include <linux/slab.h> #include <linux/rculist.h> #include "hsr_main.h" #include "hsr_framereg.h" #include "hsr_netlink.h" /* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b, * false otherwise. */ static bool seq_nr_after(u16 a, u16 b) { /* Remove inconsistency where * seq_nr_after(a, b) == seq_nr_before(a, b) */ if ((int)b - a == 32768) return false; return (((s16)(b - a)) < 0); } #define seq_nr_before(a, b) seq_nr_after((b), (a)) #define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) { struct hsr_self_node *sn; bool ret = false; rcu_read_lock(); sn = rcu_dereference(hsr->self_node); if (!sn) { WARN_ONCE(1, "HSR: No self node\n"); goto out; } if (ether_addr_equal(addr, sn->macaddress_A) || ether_addr_equal(addr, sn->macaddress_B)) ret = true; out: rcu_read_unlock(); return ret; } /* Search for mac entry. Caller must hold rcu read lock. */ static struct hsr_node *find_node_by_addr_A(struct list_head *node_db, const unsigned char addr[ETH_ALEN]) { struct hsr_node *node; list_for_each_entry_rcu(node, node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, addr)) return node; } return NULL; } /* Helper for device init; the self_node is used in hsr_rcv() to recognize * frames from self that's been looped over the HSR ring. */ int hsr_create_self_node(struct hsr_priv *hsr, const unsigned char addr_a[ETH_ALEN], const unsigned char addr_b[ETH_ALEN]) { struct hsr_self_node *sn, *old; sn = kmalloc(sizeof(*sn), GFP_KERNEL); if (!sn) return -ENOMEM; ether_addr_copy(sn->macaddress_A, addr_a); ether_addr_copy(sn->macaddress_B, addr_b); spin_lock_bh(&hsr->list_lock); old = rcu_replace_pointer(hsr->self_node, sn, lockdep_is_held(&hsr->list_lock)); spin_unlock_bh(&hsr->list_lock); if (old) kfree_rcu(old, rcu_head); return 0; } void hsr_del_self_node(struct hsr_priv *hsr) { struct hsr_self_node *old; spin_lock_bh(&hsr->list_lock); old = rcu_replace_pointer(hsr->self_node, NULL, lockdep_is_held(&hsr->list_lock)); spin_unlock_bh(&hsr->list_lock); if (old) kfree_rcu(old, rcu_head); } void hsr_del_nodes(struct list_head *node_db) { struct hsr_node *node; struct hsr_node *tmp; list_for_each_entry_safe(node, tmp, node_db, mac_list) kfree(node); } void prp_handle_san_frame(bool san, enum hsr_port_type port, struct hsr_node *node) { /* Mark if the SAN node is over LAN_A or LAN_B */ if (port == HSR_PT_SLAVE_A) { node->san_a = true; return; } if (port == HSR_PT_SLAVE_B) node->san_b = true; } /* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A; * seq_out is used to initialize filtering of outgoing duplicate frames * originating from the newly added node. */ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, struct list_head *node_db, unsigned char addr[], u16 seq_out, bool san, enum hsr_port_type rx_port) { struct hsr_node *new_node, *node; unsigned long now; int i; new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC); if (!new_node) return NULL; ether_addr_copy(new_node->macaddress_A, addr); spin_lock_init(&new_node->seq_out_lock); /* We are only interested in time diffs here, so use current jiffies * as initialization. (0 could trigger an spurious ring error warning). */ now = jiffies; for (i = 0; i < HSR_PT_PORTS; i++) { new_node->time_in[i] = now; new_node->time_out[i] = now; } for (i = 0; i < HSR_PT_PORTS; i++) new_node->seq_out[i] = seq_out; if (san && hsr->proto_ops->handle_san_frame) hsr->proto_ops->handle_san_frame(san, rx_port, new_node); spin_lock_bh(&hsr->list_lock); list_for_each_entry_rcu(node, node_db, mac_list, lockdep_is_held(&hsr->list_lock)) { if (ether_addr_equal(node->macaddress_A, addr)) goto out; if (ether_addr_equal(node->macaddress_B, addr)) goto out; } list_add_tail_rcu(&new_node->mac_list, node_db); spin_unlock_bh(&hsr->list_lock); return new_node; out: spin_unlock_bh(&hsr->list_lock); kfree(new_node); return node; } void prp_update_san_info(struct hsr_node *node, bool is_sup) { if (!is_sup) return; node->san_a = false; node->san_b = false; } /* Get the hsr_node from which 'skb' was sent. */ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, struct sk_buff *skb, bool is_sup, enum hsr_port_type rx_port) { struct hsr_priv *hsr = port->hsr; struct hsr_node *node; struct ethhdr *ethhdr; struct prp_rct *rct; bool san = false; u16 seq_out; if (!skb_mac_header_was_set(skb)) return NULL; ethhdr = (struct ethhdr *)skb_mac_header(skb); list_for_each_entry_rcu(node, node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) { if (hsr->proto_ops->update_san_info) hsr->proto_ops->update_san_info(node, is_sup); return node; } if (ether_addr_equal(node->macaddress_B, ethhdr->h_source)) { if (hsr->proto_ops->update_san_info) hsr->proto_ops->update_san_info(node, is_sup); return node; } } /* Everyone may create a node entry, connected node to a HSR/PRP * device. */ if (ethhdr->h_proto == htons(ETH_P_PRP) || ethhdr->h_proto == htons(ETH_P_HSR)) { /* Check if skb contains hsr_ethhdr */ if (skb->mac_len < sizeof(struct hsr_ethhdr)) return NULL; /* Use the existing sequence_nr from the tag as starting point * for filtering duplicate frames. */ seq_out = hsr_get_skb_sequence_nr(skb) - 1; } else { rct = skb_get_PRP_rct(skb); if (rct && prp_check_lsdu_size(skb, rct, is_sup)) { seq_out = prp_get_skb_sequence_nr(rct); } else { if (rx_port != HSR_PT_MASTER) san = true; seq_out = HSR_SEQNR_START; } } return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out, san, rx_port); } /* Use the Supervision frame's info about an eventual macaddress_B for merging * nodes that has previously had their macaddress_B registered as a separate * node. */ void hsr_handle_sup_frame(struct hsr_frame_info *frame) { struct hsr_node *node_curr = frame->node_src; struct hsr_port *port_rcv = frame->port_rcv; struct hsr_priv *hsr = port_rcv->hsr; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tlv *hsr_sup_tlv; struct hsr_node *node_real; struct sk_buff *skb = NULL; struct list_head *node_db; struct ethhdr *ethhdr; int i; unsigned int pull_size = 0; unsigned int total_pull_size = 0; /* Here either frame->skb_hsr or frame->skb_prp should be * valid as supervision frame always will have protocol * header info. */ if (frame->skb_hsr) skb = frame->skb_hsr; else if (frame->skb_prp) skb = frame->skb_prp; else if (frame->skb_std) skb = frame->skb_std; if (!skb) return; /* Leave the ethernet header. */ pull_size = sizeof(struct ethhdr); skb_pull(skb, pull_size); total_pull_size += pull_size; ethhdr = (struct ethhdr *)skb_mac_header(skb); /* And leave the HSR tag. */ if (ethhdr->h_proto == htons(ETH_P_HSR)) { pull_size = sizeof(struct hsr_tag); skb_pull(skb, pull_size); total_pull_size += pull_size; } /* And leave the HSR sup tag. */ pull_size = sizeof(struct hsr_sup_tag); skb_pull(skb, pull_size); total_pull_size += pull_size; /* get HSR sup payload */ hsr_sp = (struct hsr_sup_payload *)skb->data; /* Merge node_curr (registered on macaddress_B) into node_real */ node_db = &port_rcv->hsr->node_db; node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A); if (!node_real) /* No frame received from AddrA of this node yet */ node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A, HSR_SEQNR_START - 1, true, port_rcv->type); if (!node_real) goto done; /* No mem */ if (node_real == node_curr) /* Node has already been merged */ goto done; /* Leave the first HSR sup payload. */ pull_size = sizeof(struct hsr_sup_payload); skb_pull(skb, pull_size); total_pull_size += pull_size; /* Get second supervision tlv */ hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; /* And check if it is a redbox mac TLV */ if (hsr_sup_tlv->HSR_TLV_type == PRP_TLV_REDBOX_MAC) { /* We could stop here after pushing hsr_sup_payload, * or proceed and allow macaddress_B and for redboxes. */ /* Sanity check length */ if (hsr_sup_tlv->HSR_TLV_length != 6) goto done; /* Leave the second HSR sup tlv. */ pull_size = sizeof(struct hsr_sup_tlv); skb_pull(skb, pull_size); total_pull_size += pull_size; /* Get redbox mac address. */ hsr_sp = (struct hsr_sup_payload *)skb->data; /* Check if redbox mac and node mac are equal. */ if (!ether_addr_equal(node_real->macaddress_A, hsr_sp->macaddress_A)) { /* This is a redbox supervision frame for a VDAN! */ goto done; } } ether_addr_copy(node_real->macaddress_B, ethhdr->h_source); spin_lock_bh(&node_real->seq_out_lock); for (i = 0; i < HSR_PT_PORTS; i++) { if (!node_curr->time_in_stale[i] && time_after(node_curr->time_in[i], node_real->time_in[i])) { node_real->time_in[i] = node_curr->time_in[i]; node_real->time_in_stale[i] = node_curr->time_in_stale[i]; } if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i])) node_real->seq_out[i] = node_curr->seq_out[i]; } spin_unlock_bh(&node_real->seq_out_lock); node_real->addr_B_port = port_rcv->type; spin_lock_bh(&hsr->list_lock); if (!node_curr->removed) { list_del_rcu(&node_curr->mac_list); node_curr->removed = true; kfree_rcu(node_curr, rcu_head); } spin_unlock_bh(&hsr->list_lock); done: /* Push back here */ skb_push(skb, total_pull_size); } /* 'skb' is a frame meant for this host, that is to be passed to upper layers. * * If the frame was sent by a node's B interface, replace the source * address with that node's "official" address (macaddress_A) so that upper * layers recognize where it came from. */ void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb) { if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: Mac header not set\n", __func__); return; } memcpy(ð_hdr(skb)->h_source, node->macaddress_A, ETH_ALEN); } /* 'skb' is a frame meant for another host. * 'port' is the outgoing interface * * Substitute the target (dest) MAC address if necessary, so the it matches the * recipient interface MAC address, regardless of whether that is the * recipient's A or B interface. * This is needed to keep the packets flowing through switches that learn on * which "side" the different interfaces are. */ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, struct hsr_port *port) { struct hsr_node *node_dst; if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: Mac header not set\n", __func__); return; } if (!is_unicast_ether_addr(eth_hdr(skb)->h_dest)) return; node_dst = find_node_by_addr_A(&port->hsr->node_db, eth_hdr(skb)->h_dest); if (!node_dst) { if (port->hsr->prot_version != PRP_V1 && net_ratelimit()) netdev_err(skb->dev, "%s: Unknown node\n", __func__); return; } if (port->type != node_dst->addr_B_port) return; if (is_valid_ether_addr(node_dst->macaddress_B)) ether_addr_copy(eth_hdr(skb)->h_dest, node_dst->macaddress_B); } void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, u16 sequence_nr) { /* Don't register incoming frames without a valid sequence number. This * ensures entries of restarted nodes gets pruned so that they can * re-register and resume communications. */ if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && seq_nr_before(sequence_nr, node->seq_out[port->type])) return; node->time_in[port->type] = jiffies; node->time_in_stale[port->type] = false; } /* 'skb' is a HSR Ethernet frame (with a HSR tag inserted), with a valid * ethhdr->h_source address and skb->mac_header set. * * Return: * 1 if frame can be shown to have been sent recently on this interface, * 0 otherwise, or * negative error code on error */ int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, u16 sequence_nr) { spin_lock_bh(&node->seq_out_lock); if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) && time_is_after_jiffies(node->time_out[port->type] + msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) { spin_unlock_bh(&node->seq_out_lock); return 1; } node->time_out[port->type] = jiffies; node->seq_out[port->type] = sequence_nr; spin_unlock_bh(&node->seq_out_lock); return 0; } static struct hsr_port *get_late_port(struct hsr_priv *hsr, struct hsr_node *node) { if (node->time_in_stale[HSR_PT_SLAVE_A]) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (node->time_in_stale[HSR_PT_SLAVE_B]) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); if (time_after(node->time_in[HSR_PT_SLAVE_B], node->time_in[HSR_PT_SLAVE_A] + msecs_to_jiffies(MAX_SLAVE_DIFF))) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (time_after(node->time_in[HSR_PT_SLAVE_A], node->time_in[HSR_PT_SLAVE_B] + msecs_to_jiffies(MAX_SLAVE_DIFF))) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); return NULL; } /* Remove stale sequence_nr records. Called by timer every * HSR_LIFE_CHECK_INTERVAL (two seconds or so). */ void hsr_prune_nodes(struct timer_list *t) { struct hsr_priv *hsr = from_timer(hsr, t, prune_timer); struct hsr_node *node; struct hsr_node *tmp; struct hsr_port *port; unsigned long timestamp; unsigned long time_a, time_b; spin_lock_bh(&hsr->list_lock); list_for_each_entry_safe(node, tmp, &hsr->node_db, mac_list) { /* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A] * nor time_in[HSR_PT_SLAVE_B], will ever be updated for * the master port. Thus the master node will be repeatedly * pruned leading to packet loss. */ if (hsr_addr_is_self(hsr, node->macaddress_A)) continue; /* Shorthand */ time_a = node->time_in[HSR_PT_SLAVE_A]; time_b = node->time_in[HSR_PT_SLAVE_B]; /* Check for timestamps old enough to risk wrap-around */ if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET / 2)) node->time_in_stale[HSR_PT_SLAVE_A] = true; if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET / 2)) node->time_in_stale[HSR_PT_SLAVE_B] = true; /* Get age of newest frame from node. * At least one time_in is OK here; nodes get pruned long * before both time_ins can get stale */ timestamp = time_a; if (node->time_in_stale[HSR_PT_SLAVE_A] || (!node->time_in_stale[HSR_PT_SLAVE_B] && time_after(time_b, time_a))) timestamp = time_b; /* Warn of ring error only as long as we get frames at all */ if (time_is_after_jiffies(timestamp + msecs_to_jiffies(1.5 * MAX_SLAVE_DIFF))) { rcu_read_lock(); port = get_late_port(hsr, node); if (port) hsr_nl_ringerror(hsr, node->macaddress_A, port); rcu_read_unlock(); } /* Prune old entries */ if (time_is_before_jiffies(timestamp + msecs_to_jiffies(HSR_NODE_FORGET_TIME))) { hsr_nl_nodedown(hsr, node->macaddress_A); if (!node->removed) { list_del_rcu(&node->mac_list); node->removed = true; /* Note that we need to free this entry later: */ kfree_rcu(node, rcu_head); } } } spin_unlock_bh(&hsr->list_lock); /* Restart timer */ mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); } void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, unsigned char addr[ETH_ALEN]) { struct hsr_node *node; if (!_pos) { node = list_first_or_null_rcu(&hsr->node_db, struct hsr_node, mac_list); if (node) ether_addr_copy(addr, node->macaddress_A); return node; } node = _pos; list_for_each_entry_continue_rcu(node, &hsr->node_db, mac_list) { ether_addr_copy(addr, node->macaddress_A); return node; } return NULL; } int hsr_get_node_data(struct hsr_priv *hsr, const unsigned char *addr, unsigned char addr_b[ETH_ALEN], unsigned int *addr_b_ifindex, int *if1_age, u16 *if1_seq, int *if2_age, u16 *if2_seq) { struct hsr_node *node; struct hsr_port *port; unsigned long tdiff; node = find_node_by_addr_A(&hsr->node_db, addr); if (!node) return -ENOENT; ether_addr_copy(addr_b, node->macaddress_B); tdiff = jiffies - node->time_in[HSR_PT_SLAVE_A]; if (node->time_in_stale[HSR_PT_SLAVE_A]) *if1_age = INT_MAX; #if HZ <= MSEC_PER_SEC else if (tdiff > msecs_to_jiffies(INT_MAX)) *if1_age = INT_MAX; #endif else *if1_age = jiffies_to_msecs(tdiff); tdiff = jiffies - node->time_in[HSR_PT_SLAVE_B]; if (node->time_in_stale[HSR_PT_SLAVE_B]) *if2_age = INT_MAX; #if HZ <= MSEC_PER_SEC else if (tdiff > msecs_to_jiffies(INT_MAX)) *if2_age = INT_MAX; #endif else *if2_age = jiffies_to_msecs(tdiff); /* Present sequence numbers as if they were incoming on interface */ *if1_seq = node->seq_out[HSR_PT_SLAVE_B]; *if2_seq = node->seq_out[HSR_PT_SLAVE_A]; if (node->addr_B_port != HSR_PT_NONE) { port = hsr_port_get_hsr(hsr, node->addr_B_port); *addr_b_ifindex = port->dev->ifindex; } else { *addr_b_ifindex = -1; } return 0; } |
1744 1743 1536 85 1013 1398 876 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 | // SPDX-License-Identifier: GPL-2.0 /* * linux/lib/kasprintf.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/stdarg.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> /* Simplified asprintf. */ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap) { unsigned int first, second; char *p; va_list aq; va_copy(aq, ap); first = vsnprintf(NULL, 0, fmt, aq); va_end(aq); p = kmalloc_track_caller(first+1, gfp); if (!p) return NULL; second = vsnprintf(p, first+1, fmt, ap); WARN(first != second, "different return values (%u and %u) from vsnprintf(\"%s\", ...)", first, second, fmt); return p; } EXPORT_SYMBOL(kvasprintf); /* * If fmt contains no % (or is exactly %s), use kstrdup_const. If fmt * (or the sole vararg) points to rodata, we will then save a memory * allocation and string copy. In any case, the return value should be * freed using kfree_const(). */ const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list ap) { if (!strchr(fmt, '%')) return kstrdup_const(fmt, gfp); if (!strcmp(fmt, "%s")) return kstrdup_const(va_arg(ap, const char*), gfp); return kvasprintf(gfp, fmt, ap); } EXPORT_SYMBOL(kvasprintf_const); char *kasprintf(gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = kvasprintf(gfp, fmt, ap); va_end(ap); return p; } EXPORT_SYMBOL(kasprintf); |
63 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PATH_H #define _LINUX_PATH_H struct dentry; struct vfsmount; struct path { struct vfsmount *mnt; struct dentry *dentry; } __randomize_layout; extern void path_get(const struct path *); extern void path_put(const struct path *); static inline int path_equal(const struct path *path1, const struct path *path2) { return path1->mnt == path2->mnt && path1->dentry == path2->dentry; } static inline void path_put_init(struct path *path) { path_put(path); *path = (struct path) { }; } #endif /* _LINUX_PATH_H */ |
3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 | // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * Copyright (C) 2015-2016 Nobuo Iwata */ #include <linux/kthread.h> #include <linux/file.h> #include <linux/net.h> #include <linux/platform_device.h> #include <linux/slab.h> /* Hardening for Spectre-v1 */ #include <linux/nospec.h> #include "usbip_common.h" #include "vhci.h" /* TODO: refine locking ?*/ /* * output example: * hub port sta spd dev sockfd local_busid * hs 0000 004 000 00000000 000003 1-2.3 * ................................................ * ss 0008 004 000 00000000 000004 2-3.4 * ................................................ * * Output includes socket fd instead of socket pointer address to avoid * leaking kernel memory address in: * /sys/devices/platform/vhci_hcd.0/status and in debug output. * The socket pointer address is not used at the moment and it was made * visible as a convenient way to find IP address from socket pointer * address by looking up /proc/net/{tcp,tcp6}. As this opens a security * hole, the change is made to use sockfd instead. * */ static void port_show_vhci(char **out, int hub, int port, struct vhci_device *vdev) { if (hub == HUB_SPEED_HIGH) *out += sprintf(*out, "hs %04u %03u ", port, vdev->ud.status); else /* hub == HUB_SPEED_SUPER */ *out += sprintf(*out, "ss %04u %03u ", port, vdev->ud.status); if (vdev->ud.status == VDEV_ST_USED) { *out += sprintf(*out, "%03u %08x ", vdev->speed, vdev->devid); *out += sprintf(*out, "%06u %s", vdev->ud.sockfd, dev_name(&vdev->udev->dev)); } else { *out += sprintf(*out, "000 00000000 "); *out += sprintf(*out, "000000 0-0"); } *out += sprintf(*out, "\n"); } /* Sysfs entry to show port status */ static ssize_t status_show_vhci(int pdev_nr, char *out) { struct platform_device *pdev = vhcis[pdev_nr].pdev; struct vhci *vhci; struct usb_hcd *hcd; struct vhci_hcd *vhci_hcd; char *s = out; int i; unsigned long flags; if (!pdev || !out) { usbip_dbg_vhci_sysfs("show status error\n"); return 0; } hcd = platform_get_drvdata(pdev); vhci_hcd = hcd_to_vhci_hcd(hcd); vhci = vhci_hcd->vhci; spin_lock_irqsave(&vhci->lock, flags); for (i = 0; i < VHCI_HC_PORTS; i++) { struct vhci_device *vdev = &vhci->vhci_hcd_hs->vdev[i]; spin_lock(&vdev->ud.lock); port_show_vhci(&out, HUB_SPEED_HIGH, pdev_nr * VHCI_PORTS + i, vdev); spin_unlock(&vdev->ud.lock); } for (i = 0; i < VHCI_HC_PORTS; i++) { struct vhci_device *vdev = &vhci->vhci_hcd_ss->vdev[i]; spin_lock(&vdev->ud.lock); port_show_vhci(&out, HUB_SPEED_SUPER, pdev_nr * VHCI_PORTS + VHCI_HC_PORTS + i, vdev); spin_unlock(&vdev->ud.lock); } spin_unlock_irqrestore(&vhci->lock, flags); return out - s; } static ssize_t status_show_not_ready(int pdev_nr, char *out) { char *s = out; int i = 0; for (i = 0; i < VHCI_HC_PORTS; i++) { out += sprintf(out, "hs %04u %03u ", (pdev_nr * VHCI_PORTS) + i, VDEV_ST_NOTASSIGNED); out += sprintf(out, "000 00000000 0000000000000000 0-0"); out += sprintf(out, "\n"); } for (i = 0; i < VHCI_HC_PORTS; i++) { out += sprintf(out, "ss %04u %03u ", (pdev_nr * VHCI_PORTS) + VHCI_HC_PORTS + i, VDEV_ST_NOTASSIGNED); out += sprintf(out, "000 00000000 0000000000000000 0-0"); out += sprintf(out, "\n"); } return out - s; } static int status_name_to_id(const char *name) { char *c; long val; int ret; c = strchr(name, '.'); if (c == NULL) return 0; ret = kstrtol(c+1, 10, &val); if (ret < 0) return ret; return val; } static ssize_t status_show(struct device *dev, struct device_attribute *attr, char *out) { char *s = out; int pdev_nr; out += sprintf(out, "hub port sta spd dev sockfd local_busid\n"); pdev_nr = status_name_to_id(attr->attr.name); if (pdev_nr < 0) out += status_show_not_ready(pdev_nr, out); else out += status_show_vhci(pdev_nr, out); return out - s; } static ssize_t nports_show(struct device *dev, struct device_attribute *attr, char *out) { char *s = out; /* * Half the ports are for SPEED_HIGH and half for SPEED_SUPER, * thus the * 2. */ out += sprintf(out, "%d\n", VHCI_PORTS * vhci_num_controllers); return out - s; } static DEVICE_ATTR_RO(nports); /* Sysfs entry to shutdown a virtual connection */ static int vhci_port_disconnect(struct vhci_hcd *vhci_hcd, __u32 rhport) { struct vhci_device *vdev = &vhci_hcd->vdev[rhport]; struct vhci *vhci = vhci_hcd->vhci; unsigned long flags; usbip_dbg_vhci_sysfs("enter\n"); mutex_lock(&vdev->ud.sysfs_lock); /* lock */ spin_lock_irqsave(&vhci->lock, flags); spin_lock(&vdev->ud.lock); if (vdev->ud.status == VDEV_ST_NULL) { pr_err("not connected %d\n", vdev->ud.status); /* unlock */ spin_unlock(&vdev->ud.lock); spin_unlock_irqrestore(&vhci->lock, flags); mutex_unlock(&vdev->ud.sysfs_lock); return -EINVAL; } /* unlock */ spin_unlock(&vdev->ud.lock); spin_unlock_irqrestore(&vhci->lock, flags); usbip_event_add(&vdev->ud, VDEV_EVENT_DOWN); mutex_unlock(&vdev->ud.sysfs_lock); return 0; } static int valid_port(__u32 *pdev_nr, __u32 *rhport) { if (*pdev_nr >= vhci_num_controllers) { pr_err("pdev %u\n", *pdev_nr); return 0; } *pdev_nr = array_index_nospec(*pdev_nr, vhci_num_controllers); if (*rhport >= VHCI_HC_PORTS) { pr_err("rhport %u\n", *rhport); return 0; } *rhport = array_index_nospec(*rhport, VHCI_HC_PORTS); return 1; } static ssize_t detach_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { __u32 port = 0, pdev_nr = 0, rhport = 0; struct usb_hcd *hcd; struct vhci_hcd *vhci_hcd; int ret; if (kstrtoint(buf, 10, &port) < 0) return -EINVAL; pdev_nr = port_to_pdev_nr(port); rhport = port_to_rhport(port); if (!valid_port(&pdev_nr, &rhport)) return -EINVAL; hcd = platform_get_drvdata(vhcis[pdev_nr].pdev); if (hcd == NULL) { dev_err(dev, "port is not ready %u\n", port); return -EAGAIN; } usbip_dbg_vhci_sysfs("rhport %d\n", rhport); if ((port / VHCI_HC_PORTS) % 2) vhci_hcd = hcd_to_vhci_hcd(hcd)->vhci->vhci_hcd_ss; else vhci_hcd = hcd_to_vhci_hcd(hcd)->vhci->vhci_hcd_hs; ret = vhci_port_disconnect(vhci_hcd, rhport); if (ret < 0) return -EINVAL; usbip_dbg_vhci_sysfs("Leave\n"); return count; } static DEVICE_ATTR_WO(detach); static int valid_args(__u32 *pdev_nr, __u32 *rhport, enum usb_device_speed speed) { if (!valid_port(pdev_nr, rhport)) { return 0; } switch (speed) { case USB_SPEED_LOW: case USB_SPEED_FULL: case USB_SPEED_HIGH: case USB_SPEED_WIRELESS: case USB_SPEED_SUPER: break; default: pr_err("Failed attach request for unsupported USB speed: %s\n", usb_speed_string(speed)); return 0; } return 1; } /* Sysfs entry to establish a virtual connection */ /* * To start a new USB/IP attachment, a userland program needs to setup a TCP * connection and then write its socket descriptor with remote device * information into this sysfs file. * * A remote device is virtually attached to the root-hub port of @rhport with * @speed. @devid is embedded into a request to specify the remote device in a * server host. * * write() returns 0 on success, else negative errno. */ static ssize_t attach_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct socket *socket; int sockfd = 0; __u32 port = 0, pdev_nr = 0, rhport = 0, devid = 0, speed = 0; struct usb_hcd *hcd; struct vhci_hcd *vhci_hcd; struct vhci_device *vdev; struct vhci *vhci; int err; unsigned long flags; struct task_struct *tcp_rx = NULL; struct task_struct *tcp_tx = NULL; /* * @rhport: port number of vhci_hcd * @sockfd: socket descriptor of an established TCP connection * @devid: unique device identifier in a remote host * @speed: usb device speed in a remote host */ if (sscanf(buf, "%u %u %u %u", &port, &sockfd, &devid, &speed) != 4) return -EINVAL; pdev_nr = port_to_pdev_nr(port); rhport = port_to_rhport(port); usbip_dbg_vhci_sysfs("port(%u) pdev(%d) rhport(%u)\n", port, pdev_nr, rhport); usbip_dbg_vhci_sysfs("sockfd(%u) devid(%u) speed(%u)\n", sockfd, devid, speed); /* check received parameters */ if (!valid_args(&pdev_nr, &rhport, speed)) return -EINVAL; hcd = platform_get_drvdata(vhcis[pdev_nr].pdev); if (hcd == NULL) { dev_err(dev, "port %d is not ready\n", port); return -EAGAIN; } vhci_hcd = hcd_to_vhci_hcd(hcd); vhci = vhci_hcd->vhci; if (speed == USB_SPEED_SUPER) vdev = &vhci->vhci_hcd_ss->vdev[rhport]; else vdev = &vhci->vhci_hcd_hs->vdev[rhport]; mutex_lock(&vdev->ud.sysfs_lock); /* Extract socket from fd. */ socket = sockfd_lookup(sockfd, &err); if (!socket) { dev_err(dev, "failed to lookup sock"); err = -EINVAL; goto unlock_mutex; } if (socket->type != SOCK_STREAM) { dev_err(dev, "Expecting SOCK_STREAM - found %d", socket->type); sockfd_put(socket); err = -EINVAL; goto unlock_mutex; } /* create threads before locking */ tcp_rx = kthread_create(vhci_rx_loop, &vdev->ud, "vhci_rx"); if (IS_ERR(tcp_rx)) { sockfd_put(socket); err = -EINVAL; goto unlock_mutex; } tcp_tx = kthread_create(vhci_tx_loop, &vdev->ud, "vhci_tx"); if (IS_ERR(tcp_tx)) { kthread_stop(tcp_rx); sockfd_put(socket); err = -EINVAL; goto unlock_mutex; } /* get task structs now */ get_task_struct(tcp_rx); get_task_struct(tcp_tx); /* now begin lock until setting vdev status set */ spin_lock_irqsave(&vhci->lock, flags); spin_lock(&vdev->ud.lock); if (vdev->ud.status != VDEV_ST_NULL) { /* end of the lock */ spin_unlock(&vdev->ud.lock); spin_unlock_irqrestore(&vhci->lock, flags); sockfd_put(socket); kthread_stop_put(tcp_rx); kthread_stop_put(tcp_tx); dev_err(dev, "port %d already used\n", rhport); /* * Will be retried from userspace * if there's another free port. */ err = -EBUSY; goto unlock_mutex; } dev_info(dev, "pdev(%u) rhport(%u) sockfd(%d)\n", pdev_nr, rhport, sockfd); dev_info(dev, "devid(%u) speed(%u) speed_str(%s)\n", devid, speed, usb_speed_string(speed)); vdev->devid = devid; vdev->speed = speed; vdev->ud.sockfd = sockfd; vdev->ud.tcp_socket = socket; vdev->ud.tcp_rx = tcp_rx; vdev->ud.tcp_tx = tcp_tx; vdev->ud.status = VDEV_ST_NOTASSIGNED; usbip_kcov_handle_init(&vdev->ud); spin_unlock(&vdev->ud.lock); spin_unlock_irqrestore(&vhci->lock, flags); /* end the lock */ wake_up_process(vdev->ud.tcp_rx); wake_up_process(vdev->ud.tcp_tx); rh_port_connect(vdev, speed); dev_info(dev, "Device attached\n"); mutex_unlock(&vdev->ud.sysfs_lock); return count; unlock_mutex: mutex_unlock(&vdev->ud.sysfs_lock); return err; } static DEVICE_ATTR_WO(attach); #define MAX_STATUS_NAME 16 struct status_attr { struct device_attribute attr; char name[MAX_STATUS_NAME+1]; }; static struct status_attr *status_attrs; static void set_status_attr(int id) { struct status_attr *status; status = status_attrs + id; if (id == 0) strcpy(status->name, "status"); else snprintf(status->name, MAX_STATUS_NAME+1, "status.%d", id); status->attr.attr.name = status->name; status->attr.attr.mode = S_IRUGO; status->attr.show = status_show; sysfs_attr_init(&status->attr.attr); } static int init_status_attrs(void) { int id; status_attrs = kcalloc(vhci_num_controllers, sizeof(struct status_attr), GFP_KERNEL); if (status_attrs == NULL) return -ENOMEM; for (id = 0; id < vhci_num_controllers; id++) set_status_attr(id); return 0; } static void finish_status_attrs(void) { kfree(status_attrs); } struct attribute_group vhci_attr_group = { .attrs = NULL, }; int vhci_init_attr_group(void) { struct attribute **attrs; int ret, i; attrs = kcalloc((vhci_num_controllers + 5), sizeof(struct attribute *), GFP_KERNEL); if (attrs == NULL) return -ENOMEM; ret = init_status_attrs(); if (ret) { kfree(attrs); return ret; } *attrs = &dev_attr_nports.attr; *(attrs + 1) = &dev_attr_detach.attr; *(attrs + 2) = &dev_attr_attach.attr; *(attrs + 3) = &dev_attr_usbip_debug.attr; for (i = 0; i < vhci_num_controllers; i++) *(attrs + i + 4) = &((status_attrs + i)->attr.attr); vhci_attr_group.attrs = attrs; return 0; } void vhci_finish_attr_group(void) { finish_status_attrs(); kfree(vhci_attr_group.attrs); } |
24450 24441 24418 24418 24460 42 24418 24421 24580 6 1 1 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 | // SPDX-License-Identifier: GPL-2.0+ /* * Restartable sequences system call * * Copyright (C) 2015, Google, Inc., * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com> * Copyright (C) 2015-2018, EfficiOS Inc., * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> */ #include <linux/sched.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/rseq.h> #include <linux/types.h> #include <asm/ptrace.h> #define CREATE_TRACE_POINTS #include <trace/events/rseq.h> /* The original rseq structure size (including padding) is 32 bytes. */ #define ORIG_RSEQ_SIZE 32 #define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) /* * * Restartable sequences are a lightweight interface that allows * user-level code to be executed atomically relative to scheduler * preemption and signal delivery. Typically used for implementing * per-cpu operations. * * It allows user-space to perform update operations on per-cpu data * without requiring heavy-weight atomic operations. * * Detailed algorithm of rseq user-space assembly sequences: * * init(rseq_cs) * cpu = TLS->rseq::cpu_id_start * [1] TLS->rseq::rseq_cs = rseq_cs * [start_ip] ---------------------------- * [2] if (cpu != TLS->rseq::cpu_id) * goto abort_ip; * [3] <last_instruction_in_cs> * [post_commit_ip] ---------------------------- * * The address of jump target abort_ip must be outside the critical * region, i.e.: * * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip] * * Steps [2]-[3] (inclusive) need to be a sequence of instructions in * userspace that can handle being interrupted between any of those * instructions, and then resumed to the abort_ip. * * 1. Userspace stores the address of the struct rseq_cs assembly * block descriptor into the rseq_cs field of the registered * struct rseq TLS area. This update is performed through a single * store within the inline assembly instruction sequence. * [start_ip] * * 2. Userspace tests to check whether the current cpu_id field match * the cpu number loaded before start_ip, branching to abort_ip * in case of a mismatch. * * If the sequence is preempted or interrupted by a signal * at or after start_ip and before post_commit_ip, then the kernel * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return * ip to abort_ip before returning to user-space, so the preempted * execution resumes at abort_ip. * * 3. Userspace critical section final instruction before * post_commit_ip is the commit. The critical section is * self-terminating. * [post_commit_ip] * * 4. <success> * * On failure at [2], or if interrupted by preempt or signal delivery * between [1] and [3]: * * [abort_ip] * F1. <failure> */ static int rseq_update_cpu_node_id(struct task_struct *t) { struct rseq __user *rseq = t->rseq; u32 cpu_id = raw_smp_processor_id(); u32 node_id = cpu_to_node(cpu_id); u32 mm_cid = task_mm_cid(t); WARN_ON_ONCE((int) mm_cid < 0); if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); unsafe_put_user(node_id, &rseq->node_id, efault_end); unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally updated only if * t->rseq_len != ORIG_RSEQ_SIZE. */ user_write_access_end(); trace_rseq_update(t); return 0; efault_end: user_write_access_end(); efault: return -EFAULT; } static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, mm_cid = 0; /* * Reset cpu_id_start to its initial state (0). */ if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) return -EFAULT; /* * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming * in after unregistration can figure out that rseq needs to be * registered again. */ if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; /* * Reset node_id to its initial state (0). */ if (put_user(node_id, &t->rseq->node_id)) return -EFAULT; /* * Reset mm_cid to its initial state (0). */ if (put_user(mm_cid, &t->rseq->mm_cid)) return -EFAULT; /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally reset only if * t->rseq_len != ORIG_RSEQ_SIZE. */ return 0; } static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; u64 ptr; u32 __user *usig; u32 sig; int ret; #ifdef CONFIG_64BIT if (get_user(ptr, &t->rseq->rseq_cs)) return -EFAULT; #else if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) return -EFAULT; #endif if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } if (ptr >= TASK_SIZE) return -EINVAL; urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) return -EFAULT; if (rseq_cs->start_ip >= TASK_SIZE || rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || rseq_cs->abort_ip >= TASK_SIZE || rseq_cs->version > 0) return -EINVAL; /* Check for overflow. */ if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) return -EINVAL; /* Ensure that abort_ip is not in the critical section. */ if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) return -EINVAL; usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); ret = get_user(sig, usig); if (ret) return ret; if (current->rseq_sig != sig) { printk_ratelimited(KERN_WARNING "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", sig, current->rseq_sig, current->pid, usig); return -EINVAL; } return 0; } static bool rseq_warn_flags(const char *str, u32 flags) { u32 test_flags; if (!flags) return false; test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; if (test_flags) pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; if (test_flags) pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); return true; } static int rseq_need_restart(struct task_struct *t, u32 cs_flags) { u32 flags, event_mask; int ret; if (rseq_warn_flags("rseq_cs", cs_flags)) return -EINVAL; /* Get thread flags. */ ret = get_user(flags, &t->rseq->flags); if (ret) return ret; if (rseq_warn_flags("rseq", flags)) return -EINVAL; /* * Load and clear event mask atomically with respect to * scheduler preemption. */ preempt_disable(); event_mask = t->rseq_event_mask; t->rseq_event_mask = 0; preempt_enable(); return !!event_mask; } static int clear_rseq_cs(struct task_struct *t) { /* * The rseq_cs field is set to NULL on preemption or signal * delivery on top of rseq assembly block, as well as on top * of code outside of the rseq assembly block. This performs * a lazy clear of the rseq_cs field. * * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT return put_user(0UL, &t->rseq->rseq_cs); #else if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) return -EFAULT; return 0; #endif } /* * Unsigned comparison will be true when ip >= start_ip, and when * ip < start_ip + post_commit_offset. */ static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) { return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; } static int rseq_ip_fixup(struct pt_regs *regs) { unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; struct rseq_cs rseq_cs; int ret; ret = rseq_get_rseq_cs(t, &rseq_cs); if (ret) return ret; /* * Handle potentially not being within a critical section. * If not nested over a rseq critical section, restart is useless. * Clear the rseq_cs pointer and return. */ if (!in_rseq_cs(ip, &rseq_cs)) return clear_rseq_cs(t); ret = rseq_need_restart(t, rseq_cs.flags); if (ret <= 0) return ret; ret = clear_rseq_cs(t); if (ret) return ret; trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, rseq_cs.abort_ip); instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); return 0; } /* * This resume handler must always be executed between any of: * - preemption, * - signal delivery, * and return to user-space. * * This is how we can ensure that the entire rseq critical section * will issue the commit instruction only if executed atomically with * respect to other threads scheduled on the same CPU, and with respect * to signal handlers. */ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { struct task_struct *t = current; int ret, sig; if (unlikely(t->flags & PF_EXITING)) return; /* * regs is NULL if and only if the caller is in a syscall path. Skip * fixup and leave rseq_cs as is so that rseq_sycall() will detect and * kill a misbehaving userspace on debug kernels. */ if (regs) { ret = rseq_ip_fixup(regs); if (unlikely(ret < 0)) goto error; } if (unlikely(rseq_update_cpu_node_id(t))) goto error; return; error: sig = ksig ? ksig->sig : 0; force_sigsegv(sig); } #ifdef CONFIG_DEBUG_RSEQ /* * Terminate the process if a syscall is issued within a restartable * sequence. */ void rseq_syscall(struct pt_regs *regs) { unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; struct rseq_cs rseq_cs; if (!t->rseq) return; if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) force_sig(SIGSEGV); } #endif /* * sys_rseq - setup restartable sequences for caller thread. */ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { int ret; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; if (rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; ret = rseq_reset_rseq_cpu_node_id(current); if (ret) return ret; current->rseq = NULL; current->rseq_sig = 0; current->rseq_len = 0; return 0; } if (unlikely(flags)) return -EINVAL; if (current->rseq) { /* * If rseq is already registered, check whether * the provided address differs from the prior * one. */ if (current->rseq != rseq || rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; /* Already registered. */ return -EBUSY; } /* * If there was no rseq previously registered, ensure the provided rseq * is properly aligned, as communcated to user-space through the ELF * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq * size, the required alignment is the original struct rseq alignment. * * In order to be valid, rseq_len is either the original rseq size, or * large enough to contain all supported fields, as communicated to * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. */ if (rseq_len < ORIG_RSEQ_SIZE || (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || rseq_len < offsetof(struct rseq, end)))) return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; current->rseq = rseq; current->rseq_len = rseq_len; current->rseq_sig = sig; /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. */ rseq_set_notify_resume(current); return 0; } |
3 3 3 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 | // SPDX-License-Identifier: GPL-2.0-or-later /* * cgroups support for the BFQ I/O scheduler. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/cgroup.h> #include <linux/ktime.h> #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/sbitmap.h> #include <linux/delay.h> #include "elevator.h" #include "bfq-iosched.h" #ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfq_stat_init(struct bfq_stat *stat, gfp_t gfp) { int ret; ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); if (ret) return ret; atomic64_set(&stat->aux_cnt, 0); return 0; } static void bfq_stat_exit(struct bfq_stat *stat) { percpu_counter_destroy(&stat->cpu_cnt); } /** * bfq_stat_add - add a value to a bfq_stat * @stat: target bfq_stat * @val: value to add * * Add @val to @stat. The caller must ensure that IRQ on the same CPU * don't re-enter this function for the same counter. */ static inline void bfq_stat_add(struct bfq_stat *stat, uint64_t val) { percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); } /** * bfq_stat_read - read the current value of a bfq_stat * @stat: bfq_stat to read */ static inline uint64_t bfq_stat_read(struct bfq_stat *stat) { return percpu_counter_sum_positive(&stat->cpu_cnt); } /** * bfq_stat_reset - reset a bfq_stat * @stat: bfq_stat to reset */ static inline void bfq_stat_reset(struct bfq_stat *stat) { percpu_counter_set(&stat->cpu_cnt, 0); atomic64_set(&stat->aux_cnt, 0); } /** * bfq_stat_add_aux - add a bfq_stat into another's aux count * @to: the destination bfq_stat * @from: the source * * Add @from's count including the aux one to @to's aux count. */ static inline void bfq_stat_add_aux(struct bfq_stat *to, struct bfq_stat *from) { atomic64_add(bfq_stat_read(from) + atomic64_read(&from->aux_cnt), &to->aux_cnt); } /** * blkg_prfill_stat - prfill callback for bfq_stat * @sf: seq_file to print to * @pd: policy private data of interest * @off: offset to the bfq_stat in @pd * * prfill callback for printing a bfq_stat. */ static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off) { return __blkg_prfill_u64(sf, pd, bfq_stat_read((void *)pd + off)); } /* bfqg stats flags */ enum bfqg_stats_flags { BFQG_stats_waiting = 0, BFQG_stats_idling, BFQG_stats_empty, }; #define BFQG_FLAG_FNS(name) \ static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ { \ stats->flags |= (1 << BFQG_stats_##name); \ } \ static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ { \ stats->flags &= ~(1 << BFQG_stats_##name); \ } \ static int bfqg_stats_##name(struct bfqg_stats *stats) \ { \ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ } \ BFQG_FLAG_FNS(waiting) BFQG_FLAG_FNS(idling) BFQG_FLAG_FNS(empty) #undef BFQG_FLAG_FNS /* This should be called with the scheduler lock held. */ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) { u64 now; if (!bfqg_stats_waiting(stats)) return; now = blk_time_get_ns(); if (now > stats->start_group_wait_time) bfq_stat_add(&stats->group_wait_time, now - stats->start_group_wait_time); bfqg_stats_clear_waiting(stats); } /* This should be called with the scheduler lock held. */ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, struct bfq_group *curr_bfqg) { struct bfqg_stats *stats = &bfqg->stats; if (bfqg_stats_waiting(stats)) return; if (bfqg == curr_bfqg) return; stats->start_group_wait_time = blk_time_get_ns(); bfqg_stats_mark_waiting(stats); } /* This should be called with the scheduler lock held. */ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { u64 now; if (!bfqg_stats_empty(stats)) return; now = blk_time_get_ns(); if (now > stats->start_empty_time) bfq_stat_add(&stats->empty_time, now - stats->start_empty_time); bfqg_stats_clear_empty(stats); } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { bfq_stat_add(&bfqg->stats.dequeue, 1); } void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; if (blkg_rwstat_total(&stats->queued)) return; /* * group is already marked empty. This can happen if bfqq got new * request in parent group and moved to this group while being added * to service tree. Just ignore the event and move on. */ if (bfqg_stats_empty(stats)) return; stats->start_empty_time = blk_time_get_ns(); bfqg_stats_mark_empty(stats); } void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; if (bfqg_stats_idling(stats)) { u64 now = blk_time_get_ns(); if (now > stats->start_idle_time) bfq_stat_add(&stats->idle_time, now - stats->start_idle_time); bfqg_stats_clear_idling(stats); } } void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; stats->start_idle_time = blk_time_get_ns(); bfqg_stats_mark_idling(stats); } void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; bfq_stat_add(&stats->avg_queue_size_sum, blkg_rwstat_total(&stats->queued)); bfq_stat_add(&stats->avg_queue_size_samples, 1); bfqg_stats_update_group_wait_time(stats); } void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, blk_opf_t opf) { blkg_rwstat_add(&bfqg->stats.queued, opf, 1); bfqg_stats_end_empty_time(&bfqg->stats); if (!(bfqq == bfqg->bfqd->in_service_queue)) bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); } void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { blkg_rwstat_add(&bfqg->stats.queued, opf, -1); } void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { blkg_rwstat_add(&bfqg->stats.merged, opf, 1); } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { struct bfqg_stats *stats = &bfqg->stats; u64 now = blk_time_get_ns(); if (now > io_start_time_ns) blkg_rwstat_add(&stats->service_time, opf, now - io_start_time_ns); if (io_start_time_ns > start_time_ns) blkg_rwstat_add(&stats->wait_time, opf, io_start_time_ns - start_time_ns); } #else /* CONFIG_BFQ_CGROUP_DEBUG */ void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } #endif /* CONFIG_BFQ_CGROUP_DEBUG */ #ifdef CONFIG_BFQ_GROUP_IOSCHED /* * blk-cgroup policy-related handlers * The following functions help in converting between blk-cgroup * internal structures and BFQ-specific structures. */ static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct bfq_group, pd) : NULL; } struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) { return pd_to_blkg(&bfqg->pd); } static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) { return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq)); } /* * bfq_group handlers * The following functions help in navigating the bfq_group hierarchy * by allowing to find the parent of a bfq_group or the bfq_group * associated to a bfq_queue. */ static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) { struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; return pblkg ? blkg_to_bfqg(pblkg) : NULL; } struct bfq_group *bfqq_group(struct bfq_queue *bfqq) { struct bfq_entity *group_entity = bfqq->entity.parent; return group_entity ? container_of(group_entity, struct bfq_group, entity) : bfqq->bfqd->root_group; } /* * The following two functions handle get and put of a bfq_group by * wrapping the related blk-cgroup hooks. */ static void bfqg_get(struct bfq_group *bfqg) { refcount_inc(&bfqg->ref); } static void bfqg_put(struct bfq_group *bfqg) { if (refcount_dec_and_test(&bfqg->ref)) kfree(bfqg); } static void bfqg_and_blkg_get(struct bfq_group *bfqg) { /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ bfqg_get(bfqg); blkg_get(bfqg_to_blkg(bfqg)); } void bfqg_and_blkg_put(struct bfq_group *bfqg) { blkg_put(bfqg_to_blkg(bfqg)); bfqg_put(bfqg); } void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq) { struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg); if (!bfqg) return; blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq)); blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1); } /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { #ifdef CONFIG_BFQ_CGROUP_DEBUG /* queued stats shouldn't be cleared */ blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); bfq_stat_reset(&stats->time); bfq_stat_reset(&stats->avg_queue_size_sum); bfq_stat_reset(&stats->avg_queue_size_samples); bfq_stat_reset(&stats->dequeue); bfq_stat_reset(&stats->group_wait_time); bfq_stat_reset(&stats->idle_time); bfq_stat_reset(&stats->empty_time); #endif } /* @to += @from */ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) { if (!to || !from) return; #ifdef CONFIG_BFQ_CGROUP_DEBUG /* queued stats shouldn't be cleared */ blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); bfq_stat_add_aux(&from->time, &from->time); bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); bfq_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); bfq_stat_add_aux(&to->dequeue, &from->dequeue); bfq_stat_add_aux(&to->group_wait_time, &from->group_wait_time); bfq_stat_add_aux(&to->idle_time, &from->idle_time); bfq_stat_add_aux(&to->empty_time, &from->empty_time); #endif } /* * Transfer @bfqg's stats to its parent's aux counts so that the ancestors' * recursive stats can still account for the amount used by this bfqg after * it's gone. */ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) { struct bfq_group *parent; if (!bfqg) /* root_group */ return; parent = bfqg_parent(bfqg); lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock); if (unlikely(!parent)) return; bfqg_stats_add_aux(&parent->stats, &bfqg->stats); bfqg_stats_reset(&bfqg->stats); } void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->weight = entity->new_weight; entity->orig_weight = entity->new_weight; if (bfqq) { bfqq->ioprio = bfqq->new_ioprio; bfqq->ioprio_class = bfqq->new_ioprio_class; /* * Make sure that bfqg and its associated blkg do not * disappear before entity. */ bfqg_and_blkg_get(bfqg); } entity->parent = bfqg->my_entity; /* NULL for root group */ entity->sched_data = &bfqg->sched_data; } static void bfqg_stats_exit(struct bfqg_stats *stats) { blkg_rwstat_exit(&stats->bytes); blkg_rwstat_exit(&stats->ios); #ifdef CONFIG_BFQ_CGROUP_DEBUG blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); blkg_rwstat_exit(&stats->queued); bfq_stat_exit(&stats->time); bfq_stat_exit(&stats->avg_queue_size_sum); bfq_stat_exit(&stats->avg_queue_size_samples); bfq_stat_exit(&stats->dequeue); bfq_stat_exit(&stats->group_wait_time); bfq_stat_exit(&stats->idle_time); bfq_stat_exit(&stats->empty_time); #endif } static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { if (blkg_rwstat_init(&stats->bytes, gfp) || blkg_rwstat_init(&stats->ios, gfp)) goto error; #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || blkg_rwstat_init(&stats->queued, gfp) || bfq_stat_init(&stats->time, gfp) || bfq_stat_init(&stats->avg_queue_size_sum, gfp) || bfq_stat_init(&stats->avg_queue_size_samples, gfp) || bfq_stat_init(&stats->dequeue, gfp) || bfq_stat_init(&stats->group_wait_time, gfp) || bfq_stat_init(&stats->idle_time, gfp) || bfq_stat_init(&stats->empty_time, gfp)) goto error; #endif return 0; error: bfqg_stats_exit(stats); return -ENOMEM; } static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) { return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; } static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) { return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); } static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) { struct bfq_group_data *bgd; bgd = kzalloc(sizeof(*bgd), gfp); if (!bgd) return NULL; bgd->weight = CGROUP_WEIGHT_DFL; return &bgd->pd; } static void bfq_cpd_free(struct blkcg_policy_data *cpd) { kfree(cpd_to_bfqgd(cpd)); } static struct blkg_policy_data *bfq_pd_alloc(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct bfq_group *bfqg; bfqg = kzalloc_node(sizeof(*bfqg), gfp, disk->node_id); if (!bfqg) return NULL; if (bfqg_stats_init(&bfqg->stats, gfp)) { kfree(bfqg); return NULL; } /* see comments in bfq_bic_update_cgroup for why refcounting */ refcount_set(&bfqg->ref, 1); return &bfqg->pd; } static void bfq_pd_init(struct blkg_policy_data *pd) { struct blkcg_gq *blkg = pd_to_blkg(pd); struct bfq_group *bfqg = blkg_to_bfqg(blkg); struct bfq_data *bfqd = blkg->q->elevator->elevator_data; struct bfq_entity *entity = &bfqg->entity; struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); entity->orig_weight = entity->weight = entity->new_weight = d->weight; entity->my_sched_data = &bfqg->sched_data; entity->last_bfqq_created = NULL; bfqg->my_entity = entity; /* * the root_group's will be set to NULL * in bfq_init_queue() */ bfqg->bfqd = bfqd; bfqg->active_entities = 0; bfqg->num_queues_with_pending_reqs = 0; bfqg->rq_pos_tree = RB_ROOT; } static void bfq_pd_free(struct blkg_policy_data *pd) { struct bfq_group *bfqg = pd_to_bfqg(pd); bfqg_stats_exit(&bfqg->stats); bfqg_put(bfqg); } static void bfq_pd_reset_stats(struct blkg_policy_data *pd) { struct bfq_group *bfqg = pd_to_bfqg(pd); bfqg_stats_reset(&bfqg->stats); } static void bfq_group_set_parent(struct bfq_group *bfqg, struct bfq_group *parent) { struct bfq_entity *entity; entity = &bfqg->entity; entity->parent = parent->my_entity; entity->sched_data = &parent->sched_data; } static void bfq_link_bfqg(struct bfq_data *bfqd, struct bfq_group *bfqg) { struct bfq_group *parent; struct bfq_entity *entity; /* * Update chain of bfq_groups as we might be handling a leaf group * which, along with some of its relatives, has not been hooked yet * to the private hierarchy of BFQ. */ entity = &bfqg->entity; for_each_entity(entity) { struct bfq_group *curr_bfqg = container_of(entity, struct bfq_group, entity); if (curr_bfqg != bfqd->root_group) { parent = bfqg_parent(curr_bfqg); if (!parent) parent = bfqd->root_group; bfq_group_set_parent(curr_bfqg, parent); } } } struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; struct bfq_group *bfqg; while (blkg) { if (!blkg->online) { blkg = blkg->parent; continue; } bfqg = blkg_to_bfqg(blkg); if (bfqg->pd.online) { bio_associate_blkg_from_css(bio, &blkg->blkcg->css); return bfqg; } blkg = blkg->parent; } bio_associate_blkg_from_css(bio, &bfqg_to_blkg(bfqd->root_group)->blkcg->css); return bfqd->root_group; } /** * bfq_bfqq_move - migrate @bfqq to @bfqg. * @bfqd: queue descriptor. * @bfqq: the queue to move. * @bfqg: the group to move to. * * Move @bfqq to @bfqg, deactivating it from its old group and reactivating * it on the new one. Avoid putting the entity on the old group idle tree. * * Must be called under the scheduler lock, to make sure that the blkg * owning @bfqg does not disappear (see comments in * bfq_bic_update_cgroup on guaranteeing the consistency of blkg * objects). */ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) { struct bfq_entity *entity = &bfqq->entity; struct bfq_group *old_parent = bfqq_group(bfqq); bool has_pending_reqs = false; /* * No point to move bfqq to the same group, which can happen when * root group is offlined */ if (old_parent == bfqg) return; /* * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group * until elevator exit. */ if (bfqq == &bfqd->oom_bfqq) return; /* * Get extra reference to prevent bfqq from being freed in * next possible expire or deactivate. */ bfqq->ref++; if (entity->in_groups_with_pending_reqs) { has_pending_reqs = true; bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); } /* If bfqq is empty, then bfq_bfqq_expire also invokes * bfq_del_bfqq_busy, thereby removing bfqq and its entity * from data structures related to current group. Otherwise we * need to remove bfqq explicitly with bfq_deactivate_bfqq, as * we do below. */ if (bfqq == bfqd->in_service_queue) bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); if (bfq_bfqq_busy(bfqq)) bfq_deactivate_bfqq(bfqd, bfqq, false, false); else if (entity->on_st_or_in_serv) bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); bfqg_and_blkg_put(old_parent); if (entity->parent && entity->parent->last_bfqq_created == bfqq) entity->parent->last_bfqq_created = NULL; else if (bfqd->last_bfqq_created == bfqq) bfqd->last_bfqq_created = NULL; entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; /* pin down bfqg and its associated blkg */ bfqg_and_blkg_get(bfqg); if (has_pending_reqs) bfq_add_bfqq_in_groups_with_pending_reqs(bfqq); if (bfq_bfqq_busy(bfqq)) { if (unlikely(!bfqd->nonrot_with_queueing)) bfq_pos_tree_add_move(bfqd, bfqq); bfq_activate_bfqq(bfqd, bfqq); } if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver) bfq_schedule_dispatch(bfqd); /* release extra ref taken above, bfqq may happen to be freed now */ bfq_put_queue(bfqq); } static void bfq_sync_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *sync_bfqq, struct bfq_io_cq *bic, struct bfq_group *bfqg, unsigned int act_idx) { struct bfq_queue *bfqq; if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { /* We are the only user of this bfqq, just move it */ if (sync_bfqq->entity.sched_data != &bfqg->sched_data) bfq_bfqq_move(bfqd, sync_bfqq, bfqg); return; } /* * The queue was merged to a different queue. Check * that the merge chain still belongs to the same * cgroup. */ for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) if (bfqq->entity.sched_data != &bfqg->sched_data) break; if (bfqq) { /* * Some queue changed cgroup so the merge is not valid * anymore. We cannot easily just cancel the merge (by * clearing new_bfqq) as there may be other processes * using this queue and holding refs to all queues * below sync_bfqq->new_bfqq. Similarly if the merge * already happened, we need to detach from bfqq now * so that we cannot merge bio to a request from the * old cgroup. */ bfq_put_cooperator(sync_bfqq); bic_set_bfqq(bic, NULL, true, act_idx); bfq_release_process_ref(bfqd, sync_bfqq); } } /** * __bfq_bic_change_cgroup - move @bic to @bfqg. * @bfqd: the queue descriptor. * @bic: the bic to move. * @bfqg: the group to move to. * * Move bic to blkcg, assuming that bfqd->lock is held; which makes * sure that the reference to cgroup is valid across the call (see * comments in bfq_bic_update_cgroup on this issue) */ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd, struct bfq_io_cq *bic, struct bfq_group *bfqg) { unsigned int act_idx; for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) { struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx); struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx); if (async_bfqq && async_bfqq->entity.sched_data != &bfqg->sched_data) { bic_set_bfqq(bic, NULL, false, act_idx); bfq_release_process_ref(bfqd, async_bfqq); } if (sync_bfqq) bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx); } } void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); struct bfq_group *bfqg = bfq_bio_bfqg(bfqd, bio); uint64_t serial_nr; serial_nr = bfqg_to_blkg(bfqg)->blkcg->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. */ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) return; /* * New cgroup for this process. Make sure it is linked to bfq internal * cgroup hierarchy. */ bfq_link_bfqg(bfqd, bfqg); __bfq_bic_change_cgroup(bfqd, bic, bfqg); /* * Update blkg_path for bfq_log_* functions. We cache this * path, and update it here, for the following * reasons. Operations on blkg objects in blk-cgroup are * protected with the request_queue lock, and not with the * lock that protects the instances of this scheduler * (bfqd->lock). This exposes BFQ to the following sort of * race. * * The blkg_lookup performed in bfq_get_queue, protected * through rcu, may happen to return the address of a copy of * the original blkg. If this is the case, then the * bfqg_and_blkg_get performed in bfq_get_queue, to pin down * the blkg, is useless: it does not prevent blk-cgroup code * from destroying both the original blkg and all objects * directly or indirectly referred by the copy of the * blkg. * * On the bright side, destroy operations on a blkg invoke, as * a first step, hooks of the scheduler associated with the * blkg. And these hooks are executed with bfqd->lock held for * BFQ. As a consequence, for any blkg associated with the * request queue this instance of the scheduler is attached * to, we are guaranteed that such a blkg is not destroyed, and * that all the pointers it contains are consistent, while we * are holding bfqd->lock. A blkg_lookup performed with * bfqd->lock held then returns a fully consistent blkg, which * remains consistent until this lock is held. * * Thanks to the last fact, and to the fact that: (1) bfqg has * been obtained through a blkg_lookup in the above * assignment, and (2) bfqd->lock is being held, here we can * safely use the policy data for the involved blkg (i.e., the * field bfqg->pd) to get to the blkg associated with bfqg, * and then we can safely use any field of blkg. After we * release bfqd->lock, even just getting blkg through this * bfqg may cause dangling references to be traversed, as * bfqg->pd may not exist any more. * * In view of the above facts, here we cache, in the bfqg, any * blkg data we may need for this bic, and for its associated * bfq_queue. As of now, we need to cache only the path of the * blkg, which is used in the bfq_log_* functions. * * Finally, note that bfqg itself needs to be protected from * destruction on the blkg_free of the original blkg (which * invokes bfq_pd_free). We use an additional private * refcounter for bfqg, to let it disappear only after no * bfq_queue refers to it any longer. */ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); bic->blkcg_serial_nr = serial_nr; } /** * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. * @st: the service tree being flushed. */ static void bfq_flush_idle_tree(struct bfq_service_tree *st) { struct bfq_entity *entity = st->first_idle; for (; entity ; entity = st->first_idle) __bfq_deactivate_entity(entity, false); } /** * bfq_reparent_leaf_entity - move leaf entity to the root_group. * @bfqd: the device data structure with the root group. * @entity: the entity to move, if entity is a leaf; or the parent entity * of an active leaf entity to move, if entity is not a leaf. * @ioprio_class: I/O priority class to reparent. */ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, struct bfq_entity *entity, int ioprio_class) { struct bfq_queue *bfqq; struct bfq_entity *child_entity = entity; while (child_entity->my_sched_data) { /* leaf not reached yet */ struct bfq_sched_data *child_sd = child_entity->my_sched_data; struct bfq_service_tree *child_st = child_sd->service_tree + ioprio_class; struct rb_root *child_active = &child_st->active; child_entity = bfq_entity_of(rb_first(child_active)); if (!child_entity) child_entity = child_sd->in_service_entity; } bfqq = bfq_entity_to_bfqq(child_entity); bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); } /** * bfq_reparent_active_queues - move to the root group all active queues. * @bfqd: the device data structure with the root group. * @bfqg: the group to move from. * @st: the service tree to start the search from. * @ioprio_class: I/O priority class to reparent. */ static void bfq_reparent_active_queues(struct bfq_data *bfqd, struct bfq_group *bfqg, struct bfq_service_tree *st, int ioprio_class) { struct rb_root *active = &st->active; struct bfq_entity *entity; while ((entity = bfq_entity_of(rb_first(active)))) bfq_reparent_leaf_entity(bfqd, entity, ioprio_class); if (bfqg->sched_data.in_service_entity) bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.in_service_entity, ioprio_class); } /** * bfq_pd_offline - deactivate the entity associated with @pd, * and reparent its children entities. * @pd: descriptor of the policy going offline. * * blkio already grabs the queue_lock for us, so no need to use * RCU-based magic */ static void bfq_pd_offline(struct blkg_policy_data *pd) { struct bfq_service_tree *st; struct bfq_group *bfqg = pd_to_bfqg(pd); struct bfq_data *bfqd = bfqg->bfqd; struct bfq_entity *entity = bfqg->my_entity; unsigned long flags; int i; spin_lock_irqsave(&bfqd->lock, flags); if (!entity) /* root group */ goto put_async_queues; /* * Empty all service_trees belonging to this group before * deactivating the group itself. */ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { st = bfqg->sched_data.service_tree + i; /* * It may happen that some queues are still active * (busy) upon group destruction (if the corresponding * processes have been forced to terminate). We move * all the leaf entities corresponding to these queues * to the root_group. * Also, it may happen that the group has an entity * in service, which is disconnected from the active * tree: it must be moved, too. * There is no need to put the sync queues, as the * scheduler has taken no reference. */ bfq_reparent_active_queues(bfqd, bfqg, st, i); /* * The idle tree may still contain bfq_queues * belonging to exited task because they never * migrated to a different cgroup from the one being * destroyed now. In addition, even * bfq_reparent_active_queues() may happen to add some * entities to the idle tree. It happens if, in some * of the calls to bfq_bfqq_move() performed by * bfq_reparent_active_queues(), the queue to move is * empty and gets expired. */ bfq_flush_idle_tree(st); } __bfq_deactivate_entity(entity, false); put_async_queues: bfq_put_async_queues(bfqd, bfqg); spin_unlock_irqrestore(&bfqd->lock, flags); /* * @blkg is going offline and will be ignored by * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so * that they don't get lost. If IOs complete after this point, the * stats for them will be lost. Oh well... */ bfqg_stats_xfer_dead(bfqg); } void bfq_end_wr_async(struct bfq_data *bfqd) { struct blkcg_gq *blkg; list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); bfq_end_wr_async_queues(bfqd, bfqg); } bfq_end_wr_async_queues(bfqd, bfqd->root_group); } static int bfq_io_show_weight_legacy(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); unsigned int val = 0; if (bfqgd) val = bfqgd->weight; seq_printf(sf, "%u\n", val); return 0; } static u64 bfqg_prfill_weight_device(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct bfq_group *bfqg = pd_to_bfqg(pd); if (!bfqg->entity.dev_weight) return 0; return __blkg_prfill_u64(sf, pd, bfqg->entity.dev_weight); } static int bfq_io_show_weight(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); seq_printf(sf, "default %u\n", bfqgd->weight); blkcg_print_blkgs(sf, blkcg, bfqg_prfill_weight_device, &blkcg_policy_bfq, 0, false); return 0; } static void bfq_group_set_weight(struct bfq_group *bfqg, u64 weight, u64 dev_weight) { weight = dev_weight ?: weight; bfqg->entity.dev_weight = dev_weight; /* * Setting the prio_changed flag of the entity * to 1 with new_weight == weight would re-set * the value of the weight to its ioprio mapping. * Set the flag only if necessary. */ if ((unsigned short)weight != bfqg->entity.new_weight) { bfqg->entity.new_weight = (unsigned short)weight; /* * Make sure that the above new value has been * stored in bfqg->entity.new_weight before * setting the prio_changed flag. In fact, * this flag may be read asynchronously (in * critical sections protected by a different * lock than that held here), and finding this * flag set may cause the execution of the code * for updating parameters whose value may * depend also on bfqg->entity.new_weight (in * __bfq_entity_update_weight_prio). * This barrier makes sure that the new value * of bfqg->entity.new_weight is correctly * seen in that code. */ smp_wmb(); bfqg->entity.prio_changed = 1; } } static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); struct blkcg_gq *blkg; int ret = -ERANGE; if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) return ret; ret = 0; spin_lock_irq(&blkcg->lock); bfqgd->weight = (unsigned short)val; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); if (bfqg) bfq_group_set_weight(bfqg, val, 0); } spin_unlock_irq(&blkcg->lock); return ret; } static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int ret; struct blkg_conf_ctx ctx; struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct bfq_group *bfqg; u64 v; blkg_conf_init(&ctx, buf); ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx); if (ret) goto out; if (sscanf(ctx.body, "%llu", &v) == 1) { /* require "default" on dfl */ ret = -ERANGE; if (!v) goto out; } else if (!strcmp(strim(ctx.body), "default")) { v = 0; } else { ret = -EINVAL; goto out; } bfqg = blkg_to_bfqg(ctx.blkg); ret = -ERANGE; if (!v || (v >= BFQ_MIN_WEIGHT && v <= BFQ_MAX_WEIGHT)) { bfq_group_set_weight(bfqg, bfqg->entity.weight, v); ret = 0; } out: blkg_conf_exit(&ctx); return ret ?: nbytes; } static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { char *endp; int ret; u64 v; buf = strim(buf); /* "WEIGHT" or "default WEIGHT" sets the default weight */ v = simple_strtoull(buf, &endp, 0); if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { ret = bfq_io_set_weight_legacy(of_css(of), NULL, v); return ret ?: nbytes; } return bfq_io_set_device_weight(of, buf, nbytes, off); } static int bfqg_print_rwstat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, &blkcg_policy_bfq, seq_cft(sf)->private, true); return 0; } static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkg_rwstat_sample sum; blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); return __blkg_prfill_rwstat(sf, pd, &sum); } static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, seq_cft(sf)->private, true); return 0; } #ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfqg_print_stat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, &blkcg_policy_bfq, seq_cft(sf)->private, false); return 0; } static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkcg_gq *blkg = pd_to_blkg(pd); struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; u64 sum = 0; lockdep_assert_held(&blkg->q->queue_lock); rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { struct bfq_stat *stat; if (!pos_blkg->online) continue; stat = (void *)blkg_to_pd(pos_blkg, &blkcg_policy_bfq) + off; sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); return __blkg_prfill_u64(sf, pd, sum); } static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_stat_recursive, &blkcg_policy_bfq, seq_cft(sf)->private, false); return 0; } static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct bfq_group *bfqg = blkg_to_bfqg(pd->blkg); u64 sum = blkg_rwstat_total(&bfqg->stats.bytes); return __blkg_prfill_u64(sf, pd, sum >> 9); } static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); return 0; } static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkg_rwstat_sample tmp; blkg_rwstat_recursive_sum(pd->blkg, &blkcg_policy_bfq, offsetof(struct bfq_group, stats.bytes), &tmp); return __blkg_prfill_u64(sf, pd, (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9); } static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, false); return 0; } static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct bfq_group *bfqg = pd_to_bfqg(pd); u64 samples = bfq_stat_read(&bfqg->stats.avg_queue_size_samples); u64 v = 0; if (samples) { v = bfq_stat_read(&bfqg->stats.avg_queue_size_sum); v = div64_u64(v, samples); } __blkg_prfill_u64(sf, pd, v); return 0; } /* print avg_queue_size */ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, 0, false); return 0; } #endif /* CONFIG_BFQ_CGROUP_DEBUG */ struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { int ret; ret = blkcg_activate_policy(bfqd->queue->disk, &blkcg_policy_bfq); if (ret) return NULL; return blkg_to_bfqg(bfqd->queue->root_blkg); } struct blkcg_policy blkcg_policy_bfq = { .dfl_cftypes = bfq_blkg_files, .legacy_cftypes = bfq_blkcg_legacy_files, .cpd_alloc_fn = bfq_cpd_alloc, .cpd_free_fn = bfq_cpd_free, .pd_alloc_fn = bfq_pd_alloc, .pd_init_fn = bfq_pd_init, .pd_offline_fn = bfq_pd_offline, .pd_free_fn = bfq_pd_free, .pd_reset_stats_fn = bfq_pd_reset_stats, }; struct cftype bfq_blkcg_legacy_files[] = { { .name = "bfq.weight", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = bfq_io_show_weight_legacy, .write_u64 = bfq_io_set_weight_legacy, }, { .name = "bfq.weight_device", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = bfq_io_show_weight, .write = bfq_io_set_weight, }, /* statistics, covers only the tasks in the bfqg */ { .name = "bfq.io_service_bytes", .private = offsetof(struct bfq_group, stats.bytes), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_serviced", .private = offsetof(struct bfq_group, stats.ios), .seq_show = bfqg_print_rwstat, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { .name = "bfq.time", .private = offsetof(struct bfq_group, stats.time), .seq_show = bfqg_print_stat, }, { .name = "bfq.sectors", .seq_show = bfqg_print_stat_sectors, }, { .name = "bfq.io_service_time", .private = offsetof(struct bfq_group, stats.service_time), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_wait_time", .private = offsetof(struct bfq_group, stats.wait_time), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_merged", .private = offsetof(struct bfq_group, stats.merged), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_queued", .private = offsetof(struct bfq_group, stats.queued), .seq_show = bfqg_print_rwstat, }, #endif /* CONFIG_BFQ_CGROUP_DEBUG */ /* the same statistics which cover the bfqg and its descendants */ { .name = "bfq.io_service_bytes_recursive", .private = offsetof(struct bfq_group, stats.bytes), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_serviced_recursive", .private = offsetof(struct bfq_group, stats.ios), .seq_show = bfqg_print_rwstat_recursive, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { .name = "bfq.time_recursive", .private = offsetof(struct bfq_group, stats.time), .seq_show = bfqg_print_stat_recursive, }, { .name = "bfq.sectors_recursive", .seq_show = bfqg_print_stat_sectors_recursive, }, { .name = "bfq.io_service_time_recursive", .private = offsetof(struct bfq_group, stats.service_time), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_wait_time_recursive", .private = offsetof(struct bfq_group, stats.wait_time), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_merged_recursive", .private = offsetof(struct bfq_group, stats.merged), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_queued_recursive", .private = offsetof(struct bfq_group, stats.queued), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.avg_queue_size", .seq_show = bfqg_print_avg_queue_size, }, { .name = "bfq.group_wait_time", .private = offsetof(struct bfq_group, stats.group_wait_time), .seq_show = bfqg_print_stat, }, { .name = "bfq.idle_time", .private = offsetof(struct bfq_group, stats.idle_time), .seq_show = bfqg_print_stat, }, { .name = "bfq.empty_time", .private = offsetof(struct bfq_group, stats.empty_time), .seq_show = bfqg_print_stat, }, { .name = "bfq.dequeue", .private = offsetof(struct bfq_group, stats.dequeue), .seq_show = bfqg_print_stat, }, #endif /* CONFIG_BFQ_CGROUP_DEBUG */ { } /* terminate */ }; struct cftype bfq_blkg_files[] = { { .name = "bfq.weight", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = bfq_io_show_weight, .write = bfq_io_set_weight, }, {} /* terminate */ }; #else /* CONFIG_BFQ_GROUP_IOSCHED */ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) {} void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->weight = entity->new_weight; entity->orig_weight = entity->new_weight; if (bfqq) { bfqq->ioprio = bfqq->new_ioprio; bfqq->ioprio_class = bfqq->new_ioprio_class; } entity->sched_data = &bfqg->sched_data; } void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} void bfq_end_wr_async(struct bfq_data *bfqd) { bfq_end_wr_async_queues(bfqd, bfqd->root_group); } struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) { return bfqd->root_group; } struct bfq_group *bfqq_group(struct bfq_queue *bfqq) { return bfqq->bfqd->root_group; } void bfqg_and_blkg_put(struct bfq_group *bfqg) {} struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { struct bfq_group *bfqg; int i; bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); if (!bfqg) return NULL; for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; return bfqg; } #endif /* CONFIG_BFQ_GROUP_IOSCHED */ |
1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 | // SPDX-License-Identifier: GPL-2.0-only /* The industrial I/O core * * Copyright (c) 2008 Jonathan Cameron * * Handling of buffer allocation / resizing. * * Things to look at here. * - Better memory allocation techniques? * - Alternative access techniques? */ #include <linux/anon_inodes.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/device.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/cdev.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/sched/signal.h> #include <linux/iio/iio.h> #include <linux/iio/iio-opaque.h> #include "iio_core.h" #include "iio_core_trigger.h" #include <linux/iio/sysfs.h> #include <linux/iio/buffer.h> #include <linux/iio/buffer_impl.h> static const char * const iio_endian_prefix[] = { [IIO_BE] = "be", [IIO_LE] = "le", }; static bool iio_buffer_is_active(struct iio_buffer *buf) { return !list_empty(&buf->buffer_list); } static size_t iio_buffer_data_available(struct iio_buffer *buf) { return buf->access->data_available(buf); } static int iio_buffer_flush_hwfifo(struct iio_dev *indio_dev, struct iio_buffer *buf, size_t required) { if (!indio_dev->info->hwfifo_flush_to_buffer) return -ENODEV; return indio_dev->info->hwfifo_flush_to_buffer(indio_dev, required); } static bool iio_buffer_ready(struct iio_dev *indio_dev, struct iio_buffer *buf, size_t to_wait, int to_flush) { size_t avail; int flushed = 0; /* wakeup if the device was unregistered */ if (!indio_dev->info) return true; /* drain the buffer if it was disabled */ if (!iio_buffer_is_active(buf)) { to_wait = min_t(size_t, to_wait, 1); to_flush = 0; } avail = iio_buffer_data_available(buf); if (avail >= to_wait) { /* force a flush for non-blocking reads */ if (!to_wait && avail < to_flush) iio_buffer_flush_hwfifo(indio_dev, buf, to_flush - avail); return true; } if (to_flush) flushed = iio_buffer_flush_hwfifo(indio_dev, buf, to_wait - avail); if (flushed <= 0) return false; if (avail + flushed >= to_wait) return true; return false; } /** * iio_buffer_read() - chrdev read for buffer access * @filp: File structure pointer for the char device * @buf: Destination buffer for iio buffer read * @n: First n bytes to read * @f_ps: Long offset provided by the user as a seek position * * This function relies on all buffer implementations having an * iio_buffer as their first element. * * Return: negative values corresponding to error codes or ret != 0 * for ending the reading activity **/ static ssize_t iio_buffer_read(struct file *filp, char __user *buf, size_t n, loff_t *f_ps) { struct iio_dev_buffer_pair *ib = filp->private_data; struct iio_buffer *rb = ib->buffer; struct iio_dev *indio_dev = ib->indio_dev; DEFINE_WAIT_FUNC(wait, woken_wake_function); size_t datum_size; size_t to_wait; int ret = 0; if (!indio_dev->info) return -ENODEV; if (!rb || !rb->access->read) return -EINVAL; if (rb->direction != IIO_BUFFER_DIRECTION_IN) return -EPERM; datum_size = rb->bytes_per_datum; /* * If datum_size is 0 there will never be anything to read from the * buffer, so signal end of file now. */ if (!datum_size) return 0; if (filp->f_flags & O_NONBLOCK) to_wait = 0; else to_wait = min_t(size_t, n / datum_size, rb->watermark); add_wait_queue(&rb->pollq, &wait); do { if (!indio_dev->info) { ret = -ENODEV; break; } if (!iio_buffer_ready(indio_dev, rb, to_wait, n / datum_size)) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); continue; } ret = rb->access->read(rb, n, buf); if (ret == 0 && (filp->f_flags & O_NONBLOCK)) ret = -EAGAIN; } while (ret == 0); remove_wait_queue(&rb->pollq, &wait); return ret; } static size_t iio_buffer_space_available(struct iio_buffer *buf) { if (buf->access->space_available) return buf->access->space_available(buf); return SIZE_MAX; } static ssize_t iio_buffer_write(struct file *filp, const char __user *buf, size_t n, loff_t *f_ps) { struct iio_dev_buffer_pair *ib = filp->private_data; struct iio_buffer *rb = ib->buffer; struct iio_dev *indio_dev = ib->indio_dev; DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; size_t written; if (!indio_dev->info) return -ENODEV; if (!rb || !rb->access->write) return -EINVAL; if (rb->direction != IIO_BUFFER_DIRECTION_OUT) return -EPERM; written = 0; add_wait_queue(&rb->pollq, &wait); do { if (!indio_dev->info) return -ENODEV; if (!iio_buffer_space_available(rb)) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } if (filp->f_flags & O_NONBLOCK) { if (!written) ret = -EAGAIN; break; } wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); continue; } ret = rb->access->write(rb, n - written, buf + written); if (ret < 0) break; written += ret; } while (written != n); remove_wait_queue(&rb->pollq, &wait); return ret < 0 ? ret : written; } /** * iio_buffer_poll() - poll the buffer to find out if it has data * @filp: File structure pointer for device access * @wait: Poll table structure pointer for which the driver adds * a wait queue * * Return: (EPOLLIN | EPOLLRDNORM) if data is available for reading * or 0 for other cases */ static __poll_t iio_buffer_poll(struct file *filp, struct poll_table_struct *wait) { struct iio_dev_buffer_pair *ib = filp->private_data; struct iio_buffer *rb = ib->buffer; struct iio_dev *indio_dev = ib->indio_dev; if (!indio_dev->info || !rb) return 0; poll_wait(filp, &rb->pollq, wait); switch (rb->direction) { case IIO_BUFFER_DIRECTION_IN: if (iio_buffer_ready(indio_dev, rb, rb->watermark, 0)) return EPOLLIN | EPOLLRDNORM; break; case IIO_BUFFER_DIRECTION_OUT: if (iio_buffer_space_available(rb)) return EPOLLOUT | EPOLLWRNORM; break; } return 0; } ssize_t iio_buffer_read_wrapper(struct file *filp, char __user *buf, size_t n, loff_t *f_ps) { struct iio_dev_buffer_pair *ib = filp->private_data; struct iio_buffer *rb = ib->buffer; /* check if buffer was opened through new API */ if (test_bit(IIO_BUSY_BIT_POS, &rb->flags)) return -EBUSY; return iio_buffer_read(filp, buf, n, f_ps); } ssize_t iio_buffer_write_wrapper(struct file *filp, const char __user *buf, size_t n, loff_t *f_ps) { struct iio_dev_buffer_pair *ib = filp->private_data; struct iio_buffer *rb = ib->buffer; /* check if buffer was opened through new API */ if (test_bit(IIO_BUSY_BIT_POS, &rb->flags)) return -EBUSY; return iio_buffer_write(filp, buf, n, f_ps); } __poll_t iio_buffer_poll_wrapper(struct file *filp, struct poll_table_struct *wait) { struct iio_dev_buffer_pair *ib = filp->private_data; struct iio_buffer *rb = ib->buffer; /* check if buffer was opened through new API */ if (test_bit(IIO_BUSY_BIT_POS, &rb->flags)) return 0; return iio_buffer_poll(filp, wait); } /** * iio_buffer_wakeup_poll - Wakes up the buffer waitqueue * @indio_dev: The IIO device * * Wakes up the event waitqueue used for poll(). Should usually * be called when the device is unregistered. */ void iio_buffer_wakeup_poll(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_buffer *buffer; unsigned int i; for (i = 0; i < iio_dev_opaque->attached_buffers_cnt; i++) { buffer = iio_dev_opaque->attached_buffers[i]; wake_up(&buffer->pollq); } } int iio_pop_from_buffer(struct iio_buffer *buffer, void *data) { if (!buffer || !buffer->access || !buffer->access->remove_from) return -EINVAL; return buffer->access->remove_from(buffer, data); } EXPORT_SYMBOL_GPL(iio_pop_from_buffer); void iio_buffer_init(struct iio_buffer *buffer) { INIT_LIST_HEAD(&buffer->demux_list); INIT_LIST_HEAD(&buffer->buffer_list); init_waitqueue_head(&buffer->pollq); kref_init(&buffer->ref); if (!buffer->watermark) buffer->watermark = 1; } EXPORT_SYMBOL(iio_buffer_init); void iio_device_detach_buffers(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_buffer *buffer; unsigned int i; for (i = 0; i < iio_dev_opaque->attached_buffers_cnt; i++) { buffer = iio_dev_opaque->attached_buffers[i]; iio_buffer_put(buffer); } kfree(iio_dev_opaque->attached_buffers); } static ssize_t iio_show_scan_index(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", to_iio_dev_attr(attr)->c->scan_index); } static ssize_t iio_show_fixed_type(struct device *dev, struct device_attribute *attr, char *buf) { struct iio_dev_attr *this_attr = to_iio_dev_attr(attr); u8 type = this_attr->c->scan_type.endianness; if (type == IIO_CPU) { #ifdef __LITTLE_ENDIAN type = IIO_LE; #else type = IIO_BE; #endif } if (this_attr->c->scan_type.repeat > 1) return sysfs_emit(buf, "%s:%c%d/%dX%d>>%u\n", iio_endian_prefix[type], this_attr->c->scan_type.sign, this_attr->c->scan_type.realbits, this_attr->c->scan_type.storagebits, this_attr->c->scan_type.repeat, this_attr->c->scan_type.shift); else return sysfs_emit(buf, "%s:%c%d/%d>>%u\n", iio_endian_prefix[type], this_attr->c->scan_type.sign, this_attr->c->scan_type.realbits, this_attr->c->scan_type.storagebits, this_attr->c->scan_type.shift); } static ssize_t iio_scan_el_show(struct device *dev, struct device_attribute *attr, char *buf) { int ret; struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer; /* Ensure ret is 0 or 1. */ ret = !!test_bit(to_iio_dev_attr(attr)->address, buffer->scan_mask); return sysfs_emit(buf, "%d\n", ret); } /* Note NULL used as error indicator as it doesn't make sense. */ static const unsigned long *iio_scan_mask_match(const unsigned long *av_masks, unsigned int masklength, const unsigned long *mask, bool strict) { if (bitmap_empty(mask, masklength)) return NULL; /* * The condition here do not handle multi-long masks correctly. * It only checks the first long to be zero, and will use such mask * as a terminator even if there was bits set after the first long. * * Correct check would require using: * while (!bitmap_empty(av_masks, masklength)) * instead. This is potentially hazardous because the * avaliable_scan_masks is a zero terminated array of longs - and * using the proper bitmap_empty() check for multi-long wide masks * would require the array to be terminated with multiple zero longs - * which is not such an usual pattern. * * As writing of this no multi-long wide masks were found in-tree, so * the simple while (*av_masks) check is working. */ while (*av_masks) { if (strict) { if (bitmap_equal(mask, av_masks, masklength)) return av_masks; } else { if (bitmap_subset(mask, av_masks, masklength)) return av_masks; } av_masks += BITS_TO_LONGS(masklength); } return NULL; } static bool iio_validate_scan_mask(struct iio_dev *indio_dev, const unsigned long *mask) { if (!indio_dev->setup_ops->validate_scan_mask) return true; return indio_dev->setup_ops->validate_scan_mask(indio_dev, mask); } /** * iio_scan_mask_set() - set particular bit in the scan mask * @indio_dev: the iio device * @buffer: the buffer whose scan mask we are interested in * @bit: the bit to be set. * * Note that at this point we have no way of knowing what other * buffers might request, hence this code only verifies that the * individual buffers request is plausible. */ static int iio_scan_mask_set(struct iio_dev *indio_dev, struct iio_buffer *buffer, int bit) { const unsigned long *mask; unsigned long *trialmask; if (!indio_dev->masklength) { WARN(1, "Trying to set scanmask prior to registering buffer\n"); return -EINVAL; } trialmask = bitmap_alloc(indio_dev->masklength, GFP_KERNEL); if (!trialmask) return -ENOMEM; bitmap_copy(trialmask, buffer->scan_mask, indio_dev->masklength); set_bit(bit, trialmask); if (!iio_validate_scan_mask(indio_dev, trialmask)) goto err_invalid_mask; if (indio_dev->available_scan_masks) { mask = iio_scan_mask_match(indio_dev->available_scan_masks, indio_dev->masklength, trialmask, false); if (!mask) goto err_invalid_mask; } bitmap_copy(buffer->scan_mask, trialmask, indio_dev->masklength); bitmap_free(trialmask); return 0; err_invalid_mask: bitmap_free(trialmask); return -EINVAL; } static int iio_scan_mask_clear(struct iio_buffer *buffer, int bit) { clear_bit(bit, buffer->scan_mask); return 0; } static int iio_scan_mask_query(struct iio_dev *indio_dev, struct iio_buffer *buffer, int bit) { if (bit > indio_dev->masklength) return -EINVAL; if (!buffer->scan_mask) return 0; /* Ensure return value is 0 or 1. */ return !!test_bit(bit, buffer->scan_mask); }; static ssize_t iio_scan_el_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { int ret; bool state; struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_dev_attr *this_attr = to_iio_dev_attr(attr); struct iio_buffer *buffer = this_attr->buffer; ret = kstrtobool(buf, &state); if (ret < 0) return ret; mutex_lock(&iio_dev_opaque->mlock); if (iio_buffer_is_active(buffer)) { ret = -EBUSY; goto error_ret; } ret = iio_scan_mask_query(indio_dev, buffer, this_attr->address); if (ret < 0) goto error_ret; if (!state && ret) { ret = iio_scan_mask_clear(buffer, this_attr->address); if (ret) goto error_ret; } else if (state && !ret) { ret = iio_scan_mask_set(indio_dev, buffer, this_attr->address); if (ret) goto error_ret; } error_ret: mutex_unlock(&iio_dev_opaque->mlock); return ret < 0 ? ret : len; } static ssize_t iio_scan_el_ts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer; return sysfs_emit(buf, "%d\n", buffer->scan_timestamp); } static ssize_t iio_scan_el_ts_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { int ret; struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer; bool state; ret = kstrtobool(buf, &state); if (ret < 0) return ret; mutex_lock(&iio_dev_opaque->mlock); if (iio_buffer_is_active(buffer)) { ret = -EBUSY; goto error_ret; } buffer->scan_timestamp = state; error_ret: mutex_unlock(&iio_dev_opaque->mlock); return ret ? ret : len; } static int iio_buffer_add_channel_sysfs(struct iio_dev *indio_dev, struct iio_buffer *buffer, const struct iio_chan_spec *chan) { int ret, attrcount = 0; ret = __iio_add_chan_devattr("index", chan, &iio_show_scan_index, NULL, 0, IIO_SEPARATE, &indio_dev->dev, buffer, &buffer->buffer_attr_list); if (ret) return ret; attrcount++; ret = __iio_add_chan_devattr("type", chan, &iio_show_fixed_type, NULL, 0, IIO_SEPARATE, &indio_dev->dev, buffer, &buffer->buffer_attr_list); if (ret) return ret; attrcount++; if (chan->type != IIO_TIMESTAMP) ret = __iio_add_chan_devattr("en", chan, &iio_scan_el_show, &iio_scan_el_store, chan->scan_index, IIO_SEPARATE, &indio_dev->dev, buffer, &buffer->buffer_attr_list); else ret = __iio_add_chan_devattr("en", chan, &iio_scan_el_ts_show, &iio_scan_el_ts_store, chan->scan_index, IIO_SEPARATE, &indio_dev->dev, buffer, &buffer->buffer_attr_list); if (ret) return ret; attrcount++; ret = attrcount; return ret; } static ssize_t length_show(struct device *dev, struct device_attribute *attr, char *buf) { struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer; return sysfs_emit(buf, "%d\n", buffer->length); } static ssize_t length_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer; unsigned int val; int ret; ret = kstrtouint(buf, 10, &val); if (ret) return ret; if (val == buffer->length) return len; mutex_lock(&iio_dev_opaque->mlock); if (iio_buffer_is_active(buffer)) { ret = -EBUSY; } else { buffer->access->set_length(buffer, val); ret = 0; } if (ret) goto out; if (buffer->length && buffer->length < buffer->watermark) buffer->watermark = buffer->length; out: mutex_unlock(&iio_dev_opaque->mlock); return ret ? ret : len; } static ssize_t enable_show(struct device *dev, struct device_attribute *attr, char *buf) { struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer; return sysfs_emit(buf, "%d\n", iio_buffer_is_active(buffer)); } static unsigned int iio_storage_bytes_for_si(struct iio_dev *indio_dev, unsigned int scan_index) { const struct iio_chan_spec *ch; unsigned int bytes; ch = iio_find_channel_from_si(indio_dev, scan_index); bytes = ch->scan_type.storagebits / 8; if (ch->scan_type.repeat > 1) bytes *= ch->scan_type.repeat; return bytes; } static unsigned int iio_storage_bytes_for_timestamp(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); return iio_storage_bytes_for_si(indio_dev, iio_dev_opaque->scan_index_timestamp); } static int iio_compute_scan_bytes(struct iio_dev *indio_dev, const unsigned long *mask, bool timestamp) { unsigned int bytes = 0; int length, i, largest = 0; /* How much space will the demuxed element take? */ for_each_set_bit(i, mask, indio_dev->masklength) { length = iio_storage_bytes_for_si(indio_dev, i); bytes = ALIGN(bytes, length); bytes += length; largest = max(largest, length); } if (timestamp) { length = iio_storage_bytes_for_timestamp(indio_dev); bytes = ALIGN(bytes, length); bytes += length; largest = max(largest, length); } bytes = ALIGN(bytes, largest); return bytes; } static void iio_buffer_activate(struct iio_dev *indio_dev, struct iio_buffer *buffer) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); iio_buffer_get(buffer); list_add(&buffer->buffer_list, &iio_dev_opaque->buffer_list); } static void iio_buffer_deactivate(struct iio_buffer *buffer) { list_del_init(&buffer->buffer_list); wake_up_interruptible(&buffer->pollq); iio_buffer_put(buffer); } static void iio_buffer_deactivate_all(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_buffer *buffer, *_buffer; list_for_each_entry_safe(buffer, _buffer, &iio_dev_opaque->buffer_list, buffer_list) iio_buffer_deactivate(buffer); } static int iio_buffer_enable(struct iio_buffer *buffer, struct iio_dev *indio_dev) { if (!buffer->access->enable) return 0; return buffer->access->enable(buffer, indio_dev); } static int iio_buffer_disable(struct iio_buffer *buffer, struct iio_dev *indio_dev) { if (!buffer->access->disable) return 0; return buffer->access->disable(buffer, indio_dev); } static void iio_buffer_update_bytes_per_datum(struct iio_dev *indio_dev, struct iio_buffer *buffer) { unsigned int bytes; if (!buffer->access->set_bytes_per_datum) return; bytes = iio_compute_scan_bytes(indio_dev, buffer->scan_mask, buffer->scan_timestamp); buffer->access->set_bytes_per_datum(buffer, bytes); } static int iio_buffer_request_update(struct iio_dev *indio_dev, struct iio_buffer *buffer) { int ret; iio_buffer_update_bytes_per_datum(indio_dev, buffer); if (buffer->access->request_update) { ret = buffer->access->request_update(buffer); if (ret) { dev_dbg(&indio_dev->dev, "Buffer not started: buffer parameter update failed (%d)\n", ret); return ret; } } return 0; } static void iio_free_scan_mask(struct iio_dev *indio_dev, const unsigned long *mask) { /* If the mask is dynamically allocated free it, otherwise do nothing */ if (!indio_dev->available_scan_masks) bitmap_free(mask); } struct iio_device_config { unsigned int mode; unsigned int watermark; const unsigned long *scan_mask; unsigned int scan_bytes; bool scan_timestamp; }; static int iio_verify_update(struct iio_dev *indio_dev, struct iio_buffer *insert_buffer, struct iio_buffer *remove_buffer, struct iio_device_config *config) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); unsigned long *compound_mask; const unsigned long *scan_mask; bool strict_scanmask = false; struct iio_buffer *buffer; bool scan_timestamp; unsigned int modes; if (insert_buffer && bitmap_empty(insert_buffer->scan_mask, indio_dev->masklength)) { dev_dbg(&indio_dev->dev, "At least one scan element must be enabled first\n"); return -EINVAL; } memset(config, 0, sizeof(*config)); config->watermark = ~0; /* * If there is just one buffer and we are removing it there is nothing * to verify. */ if (remove_buffer && !insert_buffer && list_is_singular(&iio_dev_opaque->buffer_list)) return 0; modes = indio_dev->modes; list_for_each_entry(buffer, &iio_dev_opaque->buffer_list, buffer_list) { if (buffer == remove_buffer) continue; modes &= buffer->access->modes; config->watermark = min(config->watermark, buffer->watermark); } if (insert_buffer) { modes &= insert_buffer->access->modes; config->watermark = min(config->watermark, insert_buffer->watermark); } /* Definitely possible for devices to support both of these. */ if ((modes & INDIO_BUFFER_TRIGGERED) && indio_dev->trig) { config->mode = INDIO_BUFFER_TRIGGERED; } else if (modes & INDIO_BUFFER_HARDWARE) { /* * Keep things simple for now and only allow a single buffer to * be connected in hardware mode. */ if (insert_buffer && !list_empty(&iio_dev_opaque->buffer_list)) return -EINVAL; config->mode = INDIO_BUFFER_HARDWARE; strict_scanmask = true; } else if (modes & INDIO_BUFFER_SOFTWARE) { config->mode = INDIO_BUFFER_SOFTWARE; } else { /* Can only occur on first buffer */ if (indio_dev->modes & INDIO_BUFFER_TRIGGERED) dev_dbg(&indio_dev->dev, "Buffer not started: no trigger\n"); return -EINVAL; } /* What scan mask do we actually have? */ compound_mask = bitmap_zalloc(indio_dev->masklength, GFP_KERNEL); if (!compound_mask) return -ENOMEM; scan_timestamp = false; list_for_each_entry(buffer, &iio_dev_opaque->buffer_list, buffer_list) { if (buffer == remove_buffer) continue; bitmap_or(compound_mask, compound_mask, buffer->scan_mask, indio_dev->masklength); scan_timestamp |= buffer->scan_timestamp; } if (insert_buffer) { bitmap_or(compound_mask, compound_mask, insert_buffer->scan_mask, indio_dev->masklength); scan_timestamp |= insert_buffer->scan_timestamp; } if (indio_dev->available_scan_masks) { scan_mask = iio_scan_mask_match(indio_dev->available_scan_masks, indio_dev->masklength, compound_mask, strict_scanmask); bitmap_free(compound_mask); if (!scan_mask) return -EINVAL; } else { scan_mask = compound_mask; } config->scan_bytes = iio_compute_scan_bytes(indio_dev, scan_mask, scan_timestamp); config->scan_mask = scan_mask; config->scan_timestamp = scan_timestamp; return 0; } /** * struct iio_demux_table - table describing demux memcpy ops * @from: index to copy from * @to: index to copy to * @length: how many bytes to copy * @l: list head used for management */ struct iio_demux_table { unsigned int from; unsigned int to; unsigned int length; struct list_head l; }; static void iio_buffer_demux_free(struct iio_buffer *buffer) { struct iio_demux_table *p, *q; list_for_each_entry_safe(p, q, &buffer->demux_list, l) { list_del(&p->l); kfree(p); } } static int iio_buffer_add_demux(struct iio_buffer *buffer, struct iio_demux_table **p, unsigned int in_loc, unsigned int out_loc, unsigned int length) { if (*p && (*p)->from + (*p)->length == in_loc && (*p)->to + (*p)->length == out_loc) { (*p)->length += length; } else { *p = kmalloc(sizeof(**p), GFP_KERNEL); if (!(*p)) return -ENOMEM; (*p)->from = in_loc; (*p)->to = out_loc; (*p)->length = length; list_add_tail(&(*p)->l, &buffer->demux_list); } return 0; } static int iio_buffer_update_demux(struct iio_dev *indio_dev, struct iio_buffer *buffer) { int ret, in_ind = -1, out_ind, length; unsigned int in_loc = 0, out_loc = 0; struct iio_demux_table *p = NULL; /* Clear out any old demux */ iio_buffer_demux_free(buffer); kfree(buffer->demux_bounce); buffer->demux_bounce = NULL; /* First work out which scan mode we will actually have */ if (bitmap_equal(indio_dev->active_scan_mask, buffer->scan_mask, indio_dev->masklength)) return 0; /* Now we have the two masks, work from least sig and build up sizes */ for_each_set_bit(out_ind, buffer->scan_mask, indio_dev->masklength) { in_ind = find_next_bit(indio_dev->active_scan_mask, indio_dev->masklength, in_ind + 1); while (in_ind != out_ind) { length = iio_storage_bytes_for_si(indio_dev, in_ind); /* Make sure we are aligned */ in_loc = roundup(in_loc, length) + length; in_ind = find_next_bit(indio_dev->active_scan_mask, indio_dev->masklength, in_ind + 1); } length = iio_storage_bytes_for_si(indio_dev, in_ind); out_loc = roundup(out_loc, length); in_loc = roundup(in_loc, length); ret = iio_buffer_add_demux(buffer, &p, in_loc, out_loc, length); if (ret) goto error_clear_mux_table; out_loc += length; in_loc += length; } /* Relies on scan_timestamp being last */ if (buffer->scan_timestamp) { length = iio_storage_bytes_for_timestamp(indio_dev); out_loc = roundup(out_loc, length); in_loc = roundup(in_loc, length); ret = iio_buffer_add_demux(buffer, &p, in_loc, out_loc, length); if (ret) goto error_clear_mux_table; out_loc += length; } buffer->demux_bounce = kzalloc(out_loc, GFP_KERNEL); if (!buffer->demux_bounce) { ret = -ENOMEM; goto error_clear_mux_table; } return 0; error_clear_mux_table: iio_buffer_demux_free(buffer); return ret; } static int iio_update_demux(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_buffer *buffer; int ret; list_for_each_entry(buffer, &iio_dev_opaque->buffer_list, buffer_list) { ret = iio_buffer_update_demux(indio_dev, buffer); if (ret < 0) goto error_clear_mux_table; } return 0; error_clear_mux_table: list_for_each_entry(buffer, &iio_dev_opaque->buffer_list, buffer_list) iio_buffer_demux_free(buffer); return ret; } static int iio_enable_buffers(struct iio_dev *indio_dev, struct iio_device_config *config) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_buffer *buffer, *tmp = NULL; int ret; indio_dev->active_scan_mask = config->scan_mask; indio_dev->scan_timestamp = config->scan_timestamp; indio_dev->scan_bytes = config->scan_bytes; iio_dev_opaque->currentmode = config->mode; iio_update_demux(indio_dev); /* Wind up again */ if (indio_dev->setup_ops->preenable) { ret = indio_dev->setup_ops->preenable(indio_dev); if (ret) { dev_dbg(&indio_dev->dev, "Buffer not started: buffer preenable failed (%d)\n", ret); goto err_undo_config; } } if (indio_dev->info->update_scan_mode) { ret = indio_dev->info ->update_scan_mode(indio_dev, indio_dev->active_scan_mask); if (ret < 0) { dev_dbg(&indio_dev->dev, "Buffer not started: update scan mode failed (%d)\n", ret); goto err_run_postdisable; } } if (indio_dev->info->hwfifo_set_watermark) indio_dev->info->hwfifo_set_watermark(indio_dev, config->watermark); list_for_each_entry(buffer, &iio_dev_opaque->buffer_list, buffer_list) { ret = iio_buffer_enable(buffer, indio_dev); if (ret) { tmp = buffer; goto err_disable_buffers; } } if (iio_dev_opaque->currentmode == INDIO_BUFFER_TRIGGERED) { ret = iio_trigger_attach_poll_func(indio_dev->trig, indio_dev->pollfunc); if (ret) goto err_disable_buffers; } if (indio_dev->setup_ops->postenable) { ret = indio_dev->setup_ops->postenable(indio_dev); if (ret) { dev_dbg(&indio_dev->dev, "Buffer not started: postenable failed (%d)\n", ret); goto err_detach_pollfunc; } } return 0; err_detach_pollfunc: if (iio_dev_opaque->currentmode == INDIO_BUFFER_TRIGGERED) { iio_trigger_detach_poll_func(indio_dev->trig, indio_dev->pollfunc); } err_disable_buffers: buffer = list_prepare_entry(tmp, &iio_dev_opaque->buffer_list, buffer_list); list_for_each_entry_continue_reverse(buffer, &iio_dev_opaque->buffer_list, buffer_list) iio_buffer_disable(buffer, indio_dev); err_run_postdisable: if (indio_dev->setup_ops->postdisable) indio_dev->setup_ops->postdisable(indio_dev); err_undo_config: iio_dev_opaque->currentmode = INDIO_DIRECT_MODE; indio_dev->active_scan_mask = NULL; return ret; } static int iio_disable_buffers(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_buffer *buffer; int ret = 0; int ret2; /* Wind down existing buffers - iff there are any */ if (list_empty(&iio_dev_opaque->buffer_list)) return 0; /* * If things go wrong at some step in disable we still need to continue * to perform the other steps, otherwise we leave the device in a * inconsistent state. We return the error code for the first error we * encountered. */ if (indio_dev->setup_ops->predisable) { ret2 = indio_dev->setup_ops->predisable(indio_dev); if (ret2 && !ret) ret = ret2; } if (iio_dev_opaque->currentmode == INDIO_BUFFER_TRIGGERED) { iio_trigger_detach_poll_func(indio_dev->trig, indio_dev->pollfunc); } list_for_each_entry(buffer, &iio_dev_opaque->buffer_list, buffer_list) { ret2 = iio_buffer_disable(buffer, indio_dev); if (ret2 && !ret) ret = ret2; } if (indio_dev->setup_ops->postdisable) { ret2 = indio_dev->setup_ops->postdisable(indio_dev); if (ret2 && !ret) ret = ret2; } iio_free_scan_mask(indio_dev, indio_dev->active_scan_mask); indio_dev->active_scan_mask = NULL; iio_dev_opaque->currentmode = INDIO_DIRECT_MODE; return ret; } static int __iio_update_buffers(struct iio_dev *indio_dev, struct iio_buffer *insert_buffer, struct iio_buffer *remove_buffer) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_device_config new_config; int ret; ret = iio_verify_update(indio_dev, insert_buffer, remove_buffer, &new_config); if (ret) return ret; if (insert_buffer) { ret = iio_buffer_request_update(indio_dev, insert_buffer); if (ret) goto err_free_config; } ret = iio_disable_buffers(indio_dev); if (ret) goto err_deactivate_all; if (remove_buffer) iio_buffer_deactivate(remove_buffer); if (insert_buffer) iio_buffer_activate(indio_dev, insert_buffer); /* If no buffers in list, we are done */ if (list_empty(&iio_dev_opaque->buffer_list)) return 0; ret = iio_enable_buffers(indio_dev, &new_config); if (ret) goto err_deactivate_all; return 0; err_deactivate_all: /* * We've already verified that the config is valid earlier. If things go * wrong in either enable or disable the most likely reason is an IO * error from the device. In this case there is no good recovery * strategy. Just make sure to disable everything and leave the device * in a sane state. With a bit of luck the device might come back to * life again later and userspace can try again. */ iio_buffer_deactivate_all(indio_dev); err_free_config: iio_free_scan_mask(indio_dev, new_config.scan_mask); return ret; } int iio_update_buffers(struct iio_dev *indio_dev, struct iio_buffer *insert_buffer, struct iio_buffer *remove_buffer) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); int ret; if (insert_buffer == remove_buffer) return 0; if (insert_buffer && insert_buffer->direction == IIO_BUFFER_DIRECTION_OUT) return -EINVAL; mutex_lock(&iio_dev_opaque->info_exist_lock); mutex_lock(&iio_dev_opaque->mlock); if (insert_buffer && iio_buffer_is_active(insert_buffer)) insert_buffer = NULL; if (remove_buffer && !iio_buffer_is_active(remove_buffer)) remove_buffer = NULL; if (!insert_buffer && !remove_buffer) { ret = 0; goto out_unlock; } if (!indio_dev->info) { ret = -ENODEV; goto out_unlock; } ret = __iio_update_buffers(indio_dev, insert_buffer, remove_buffer); out_unlock: mutex_unlock(&iio_dev_opaque->mlock); mutex_unlock(&iio_dev_opaque->info_exist_lock); return ret; } EXPORT_SYMBOL_GPL(iio_update_buffers); void iio_disable_all_buffers(struct iio_dev *indio_dev) { iio_disable_buffers(indio_dev); iio_buffer_deactivate_all(indio_dev); } static ssize_t enable_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { int ret; bool requested_state; struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer; bool inlist; ret = kstrtobool(buf, &requested_state); if (ret < 0) return ret; mutex_lock(&iio_dev_opaque->mlock); /* Find out if it is in the list */ inlist = iio_buffer_is_active(buffer); /* Already in desired state */ if (inlist == requested_state) goto done; if (requested_state) ret = __iio_update_buffers(indio_dev, buffer, NULL); else ret = __iio_update_buffers(indio_dev, NULL, buffer); done: mutex_unlock(&iio_dev_opaque->mlock); return (ret < 0) ? ret : len; } static ssize_t watermark_show(struct device *dev, struct device_attribute *attr, char *buf) { struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer; return sysfs_emit(buf, "%u\n", buffer->watermark); } static ssize_t watermark_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer; unsigned int val; int ret; ret = kstrtouint(buf, 10, &val); if (ret) return ret; if (!val) return -EINVAL; mutex_lock(&iio_dev_opaque->mlock); if (val > buffer->length) { ret = -EINVAL; goto out; } if (iio_buffer_is_active(buffer)) { ret = -EBUSY; goto out; } buffer->watermark = val; out: mutex_unlock(&iio_dev_opaque->mlock); return ret ? ret : len; } static ssize_t data_available_show(struct device *dev, struct device_attribute *attr, char *buf) { struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer; return sysfs_emit(buf, "%zu\n", iio_buffer_data_available(buffer)); } static ssize_t direction_show(struct device *dev, struct device_attribute *attr, char *buf) { struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer; switch (buffer->direction) { case IIO_BUFFER_DIRECTION_IN: return sysfs_emit(buf, "in\n"); case IIO_BUFFER_DIRECTION_OUT: return sysfs_emit(buf, "out\n"); default: return -EINVAL; } } static DEVICE_ATTR_RW(length); static struct device_attribute dev_attr_length_ro = __ATTR_RO(length); static DEVICE_ATTR_RW(enable); static DEVICE_ATTR_RW(watermark); static struct device_attribute dev_attr_watermark_ro = __ATTR_RO(watermark); static DEVICE_ATTR_RO(data_available); static DEVICE_ATTR_RO(direction); /* * When adding new attributes here, put the at the end, at least until * the code that handles the length/length_ro & watermark/watermark_ro * assignments gets cleaned up. Otherwise these can create some weird * duplicate attributes errors under some setups. */ static struct attribute *iio_buffer_attrs[] = { &dev_attr_length.attr, &dev_attr_enable.attr, &dev_attr_watermark.attr, &dev_attr_data_available.attr, &dev_attr_direction.attr, }; #define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr) static struct attribute *iio_buffer_wrap_attr(struct iio_buffer *buffer, struct attribute *attr) { struct device_attribute *dattr = to_dev_attr(attr); struct iio_dev_attr *iio_attr; iio_attr = kzalloc(sizeof(*iio_attr), GFP_KERNEL); if (!iio_attr) return NULL; iio_attr->buffer = buffer; memcpy(&iio_attr->dev_attr, dattr, sizeof(iio_attr->dev_attr)); iio_attr->dev_attr.attr.name = kstrdup_const(attr->name, GFP_KERNEL); if (!iio_attr->dev_attr.attr.name) { kfree(iio_attr); return NULL; } sysfs_attr_init(&iio_attr->dev_attr.attr); list_add(&iio_attr->l, &buffer->buffer_attr_list); return &iio_attr->dev_attr.attr; } static int iio_buffer_register_legacy_sysfs_groups(struct iio_dev *indio_dev, struct attribute **buffer_attrs, int buffer_attrcount, int scan_el_attrcount) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct attribute_group *group; struct attribute **attrs; int ret; attrs = kcalloc(buffer_attrcount + 1, sizeof(*attrs), GFP_KERNEL); if (!attrs) return -ENOMEM; memcpy(attrs, buffer_attrs, buffer_attrcount * sizeof(*attrs)); group = &iio_dev_opaque->legacy_buffer_group; group->attrs = attrs; group->name = "buffer"; ret = iio_device_register_sysfs_group(indio_dev, group); if (ret) goto error_free_buffer_attrs; attrs = kcalloc(scan_el_attrcount + 1, sizeof(*attrs), GFP_KERNEL); if (!attrs) { ret = -ENOMEM; goto error_free_buffer_attrs; } memcpy(attrs, &buffer_attrs[buffer_attrcount], scan_el_attrcount * sizeof(*attrs)); group = &iio_dev_opaque->legacy_scan_el_group; group->attrs = attrs; group->name = "scan_elements"; ret = iio_device_register_sysfs_group(indio_dev, group); if (ret) goto error_free_scan_el_attrs; return 0; error_free_scan_el_attrs: kfree(iio_dev_opaque->legacy_scan_el_group.attrs); error_free_buffer_attrs: kfree(iio_dev_opaque->legacy_buffer_group.attrs); return ret; } static void iio_buffer_unregister_legacy_sysfs_groups(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); kfree(iio_dev_opaque->legacy_buffer_group.attrs); kfree(iio_dev_opaque->legacy_scan_el_group.attrs); } static int iio_buffer_chrdev_release(struct inode *inode, struct file *filep) { struct iio_dev_buffer_pair *ib = filep->private_data; struct iio_dev *indio_dev = ib->indio_dev; struct iio_buffer *buffer = ib->buffer; wake_up(&buffer->pollq); kfree(ib); clear_bit(IIO_BUSY_BIT_POS, &buffer->flags); iio_device_put(indio_dev); return 0; } static const struct file_operations iio_buffer_chrdev_fileops = { .owner = THIS_MODULE, .llseek = noop_llseek, .read = iio_buffer_read, .write = iio_buffer_write, .poll = iio_buffer_poll, .release = iio_buffer_chrdev_release, }; static long iio_device_buffer_getfd(struct iio_dev *indio_dev, unsigned long arg) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); int __user *ival = (int __user *)arg; struct iio_dev_buffer_pair *ib; struct iio_buffer *buffer; int fd, idx, ret; if (copy_from_user(&idx, ival, sizeof(idx))) return -EFAULT; if (idx >= iio_dev_opaque->attached_buffers_cnt) return -ENODEV; iio_device_get(indio_dev); buffer = iio_dev_opaque->attached_buffers[idx]; if (test_and_set_bit(IIO_BUSY_BIT_POS, &buffer->flags)) { ret = -EBUSY; goto error_iio_dev_put; } ib = kzalloc(sizeof(*ib), GFP_KERNEL); if (!ib) { ret = -ENOMEM; goto error_clear_busy_bit; } ib->indio_dev = indio_dev; ib->buffer = buffer; fd = anon_inode_getfd("iio:buffer", &iio_buffer_chrdev_fileops, ib, O_RDWR | O_CLOEXEC); if (fd < 0) { ret = fd; goto error_free_ib; } if (copy_to_user(ival, &fd, sizeof(fd))) { /* * "Leak" the fd, as there's not much we can do about this * anyway. 'fd' might have been closed already, as * anon_inode_getfd() called fd_install() on it, which made * it reachable by userland. * * Instead of allowing a malicious user to play tricks with * us, rely on the process exit path to do any necessary * cleanup, as in releasing the file, if still needed. */ return -EFAULT; } return 0; error_free_ib: kfree(ib); error_clear_busy_bit: clear_bit(IIO_BUSY_BIT_POS, &buffer->flags); error_iio_dev_put: iio_device_put(indio_dev); return ret; } static long iio_device_buffer_ioctl(struct iio_dev *indio_dev, struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case IIO_BUFFER_GET_FD_IOCTL: return iio_device_buffer_getfd(indio_dev, arg); default: return IIO_IOCTL_UNHANDLED; } } static int __iio_buffer_alloc_sysfs_and_mask(struct iio_buffer *buffer, struct iio_dev *indio_dev, int index) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_dev_attr *p; const struct iio_dev_attr *id_attr; struct attribute **attr; int ret, i, attrn, scan_el_attrcount, buffer_attrcount; const struct iio_chan_spec *channels; buffer_attrcount = 0; if (buffer->attrs) { while (buffer->attrs[buffer_attrcount]) buffer_attrcount++; } buffer_attrcount += ARRAY_SIZE(iio_buffer_attrs); scan_el_attrcount = 0; INIT_LIST_HEAD(&buffer->buffer_attr_list); channels = indio_dev->channels; if (channels) { /* new magic */ for (i = 0; i < indio_dev->num_channels; i++) { if (channels[i].scan_index < 0) continue; /* Verify that sample bits fit into storage */ if (channels[i].scan_type.storagebits < channels[i].scan_type.realbits + channels[i].scan_type.shift) { dev_err(&indio_dev->dev, "Channel %d storagebits (%d) < shifted realbits (%d + %d)\n", i, channels[i].scan_type.storagebits, channels[i].scan_type.realbits, channels[i].scan_type.shift); ret = -EINVAL; goto error_cleanup_dynamic; } ret = iio_buffer_add_channel_sysfs(indio_dev, buffer, &channels[i]); if (ret < 0) goto error_cleanup_dynamic; scan_el_attrcount += ret; if (channels[i].type == IIO_TIMESTAMP) iio_dev_opaque->scan_index_timestamp = channels[i].scan_index; } if (indio_dev->masklength && !buffer->scan_mask) { buffer->scan_mask = bitmap_zalloc(indio_dev->masklength, GFP_KERNEL); if (!buffer->scan_mask) { ret = -ENOMEM; goto error_cleanup_dynamic; } } } attrn = buffer_attrcount + scan_el_attrcount; attr = kcalloc(attrn + 1, sizeof(*attr), GFP_KERNEL); if (!attr) { ret = -ENOMEM; goto error_free_scan_mask; } memcpy(attr, iio_buffer_attrs, sizeof(iio_buffer_attrs)); if (!buffer->access->set_length) attr[0] = &dev_attr_length_ro.attr; if (buffer->access->flags & INDIO_BUFFER_FLAG_FIXED_WATERMARK) attr[2] = &dev_attr_watermark_ro.attr; if (buffer->attrs) for (i = 0, id_attr = buffer->attrs[i]; (id_attr = buffer->attrs[i]); i++) attr[ARRAY_SIZE(iio_buffer_attrs) + i] = (struct attribute *)&id_attr->dev_attr.attr; buffer->buffer_group.attrs = attr; for (i = 0; i < buffer_attrcount; i++) { struct attribute *wrapped; wrapped = iio_buffer_wrap_attr(buffer, attr[i]); if (!wrapped) { ret = -ENOMEM; goto error_free_buffer_attrs; } attr[i] = wrapped; } attrn = 0; list_for_each_entry(p, &buffer->buffer_attr_list, l) attr[attrn++] = &p->dev_attr.attr; buffer->buffer_group.name = kasprintf(GFP_KERNEL, "buffer%d", index); if (!buffer->buffer_group.name) { ret = -ENOMEM; goto error_free_buffer_attrs; } ret = iio_device_register_sysfs_group(indio_dev, &buffer->buffer_group); if (ret) goto error_free_buffer_attr_group_name; /* we only need to register the legacy groups for the first buffer */ if (index > 0) return 0; ret = iio_buffer_register_legacy_sysfs_groups(indio_dev, attr, buffer_attrcount, scan_el_attrcount); if (ret) goto error_free_buffer_attr_group_name; return 0; error_free_buffer_attr_group_name: kfree(buffer->buffer_group.name); error_free_buffer_attrs: kfree(buffer->buffer_group.attrs); error_free_scan_mask: bitmap_free(buffer->scan_mask); error_cleanup_dynamic: iio_free_chan_devattr_list(&buffer->buffer_attr_list); return ret; } static void __iio_buffer_free_sysfs_and_mask(struct iio_buffer *buffer, struct iio_dev *indio_dev, int index) { if (index == 0) iio_buffer_unregister_legacy_sysfs_groups(indio_dev); bitmap_free(buffer->scan_mask); kfree(buffer->buffer_group.name); kfree(buffer->buffer_group.attrs); iio_free_chan_devattr_list(&buffer->buffer_attr_list); } int iio_buffers_alloc_sysfs_and_mask(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); const struct iio_chan_spec *channels; struct iio_buffer *buffer; int ret, i, idx; size_t sz; channels = indio_dev->channels; if (channels) { int ml = indio_dev->masklength; for (i = 0; i < indio_dev->num_channels; i++) ml = max(ml, channels[i].scan_index + 1); indio_dev->masklength = ml; } if (!iio_dev_opaque->attached_buffers_cnt) return 0; for (idx = 0; idx < iio_dev_opaque->attached_buffers_cnt; idx++) { buffer = iio_dev_opaque->attached_buffers[idx]; ret = __iio_buffer_alloc_sysfs_and_mask(buffer, indio_dev, idx); if (ret) goto error_unwind_sysfs_and_mask; } sz = sizeof(*iio_dev_opaque->buffer_ioctl_handler); iio_dev_opaque->buffer_ioctl_handler = kzalloc(sz, GFP_KERNEL); if (!iio_dev_opaque->buffer_ioctl_handler) { ret = -ENOMEM; goto error_unwind_sysfs_and_mask; } iio_dev_opaque->buffer_ioctl_handler->ioctl = iio_device_buffer_ioctl; iio_device_ioctl_handler_register(indio_dev, iio_dev_opaque->buffer_ioctl_handler); return 0; error_unwind_sysfs_and_mask: while (idx--) { buffer = iio_dev_opaque->attached_buffers[idx]; __iio_buffer_free_sysfs_and_mask(buffer, indio_dev, idx); } return ret; } void iio_buffers_free_sysfs_and_mask(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_buffer *buffer; int i; if (!iio_dev_opaque->attached_buffers_cnt) return; iio_device_ioctl_handler_unregister(iio_dev_opaque->buffer_ioctl_handler); kfree(iio_dev_opaque->buffer_ioctl_handler); for (i = iio_dev_opaque->attached_buffers_cnt - 1; i >= 0; i--) { buffer = iio_dev_opaque->attached_buffers[i]; __iio_buffer_free_sysfs_and_mask(buffer, indio_dev, i); } } /** * iio_validate_scan_mask_onehot() - Validates that exactly one channel is selected * @indio_dev: the iio device * @mask: scan mask to be checked * * Return true if exactly one bit is set in the scan mask, false otherwise. It * can be used for devices where only one channel can be active for sampling at * a time. */ bool iio_validate_scan_mask_onehot(struct iio_dev *indio_dev, const unsigned long *mask) { return bitmap_weight(mask, indio_dev->masklength) == 1; } EXPORT_SYMBOL_GPL(iio_validate_scan_mask_onehot); static const void *iio_demux(struct iio_buffer *buffer, const void *datain) { struct iio_demux_table *t; if (list_empty(&buffer->demux_list)) return datain; list_for_each_entry(t, &buffer->demux_list, l) memcpy(buffer->demux_bounce + t->to, datain + t->from, t->length); return buffer->demux_bounce; } static int iio_push_to_buffer(struct iio_buffer *buffer, const void *data) { const void *dataout = iio_demux(buffer, data); int ret; ret = buffer->access->store_to(buffer, dataout); if (ret) return ret; /* * We can't just test for watermark to decide if we wake the poll queue * because read may request less samples than the watermark. */ wake_up_interruptible_poll(&buffer->pollq, EPOLLIN | EPOLLRDNORM); return 0; } /** * iio_push_to_buffers() - push to a registered buffer. * @indio_dev: iio_dev structure for device. * @data: Full scan. */ int iio_push_to_buffers(struct iio_dev *indio_dev, const void *data) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); int ret; struct iio_buffer *buf; list_for_each_entry(buf, &iio_dev_opaque->buffer_list, buffer_list) { ret = iio_push_to_buffer(buf, data); if (ret < 0) return ret; } return 0; } EXPORT_SYMBOL_GPL(iio_push_to_buffers); /** * iio_push_to_buffers_with_ts_unaligned() - push to registered buffer, * no alignment or space requirements. * @indio_dev: iio_dev structure for device. * @data: channel data excluding the timestamp. * @data_sz: size of data. * @timestamp: timestamp for the sample data. * * This special variant of iio_push_to_buffers_with_timestamp() does * not require space for the timestamp, or 8 byte alignment of data. * It does however require an allocation on first call and additional * copies on all calls, so should be avoided if possible. */ int iio_push_to_buffers_with_ts_unaligned(struct iio_dev *indio_dev, const void *data, size_t data_sz, int64_t timestamp) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); /* * Conservative estimate - we can always safely copy the minimum * of either the data provided or the length of the destination buffer. * This relaxed limit allows the calling drivers to be lax about * tracking the size of the data they are pushing, at the cost of * unnecessary copying of padding. */ data_sz = min_t(size_t, indio_dev->scan_bytes, data_sz); if (iio_dev_opaque->bounce_buffer_size != indio_dev->scan_bytes) { void *bb; bb = devm_krealloc(&indio_dev->dev, iio_dev_opaque->bounce_buffer, indio_dev->scan_bytes, GFP_KERNEL); if (!bb) return -ENOMEM; iio_dev_opaque->bounce_buffer = bb; iio_dev_opaque->bounce_buffer_size = indio_dev->scan_bytes; } memcpy(iio_dev_opaque->bounce_buffer, data, data_sz); return iio_push_to_buffers_with_timestamp(indio_dev, iio_dev_opaque->bounce_buffer, timestamp); } EXPORT_SYMBOL_GPL(iio_push_to_buffers_with_ts_unaligned); /** * iio_buffer_release() - Free a buffer's resources * @ref: Pointer to the kref embedded in the iio_buffer struct * * This function is called when the last reference to the buffer has been * dropped. It will typically free all resources allocated by the buffer. Do not * call this function manually, always use iio_buffer_put() when done using a * buffer. */ static void iio_buffer_release(struct kref *ref) { struct iio_buffer *buffer = container_of(ref, struct iio_buffer, ref); buffer->access->release(buffer); } /** * iio_buffer_get() - Grab a reference to the buffer * @buffer: The buffer to grab a reference for, may be NULL * * Returns the pointer to the buffer that was passed into the function. */ struct iio_buffer *iio_buffer_get(struct iio_buffer *buffer) { if (buffer) kref_get(&buffer->ref); return buffer; } EXPORT_SYMBOL_GPL(iio_buffer_get); /** * iio_buffer_put() - Release the reference to the buffer * @buffer: The buffer to release the reference for, may be NULL */ void iio_buffer_put(struct iio_buffer *buffer) { if (buffer) kref_put(&buffer->ref, iio_buffer_release); } EXPORT_SYMBOL_GPL(iio_buffer_put); /** * iio_device_attach_buffer - Attach a buffer to a IIO device * @indio_dev: The device the buffer should be attached to * @buffer: The buffer to attach to the device * * Return 0 if successful, negative if error. * * This function attaches a buffer to a IIO device. The buffer stays attached to * the device until the device is freed. For legacy reasons, the first attached * buffer will also be assigned to 'indio_dev->buffer'. * The array allocated here, will be free'd via the iio_device_detach_buffers() * call which is handled by the iio_device_free(). */ int iio_device_attach_buffer(struct iio_dev *indio_dev, struct iio_buffer *buffer) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_buffer **new, **old = iio_dev_opaque->attached_buffers; unsigned int cnt = iio_dev_opaque->attached_buffers_cnt; cnt++; new = krealloc(old, sizeof(*new) * cnt, GFP_KERNEL); if (!new) return -ENOMEM; iio_dev_opaque->attached_buffers = new; buffer = iio_buffer_get(buffer); /* first buffer is legacy; attach it to the IIO device directly */ if (!indio_dev->buffer) indio_dev->buffer = buffer; iio_dev_opaque->attached_buffers[cnt - 1] = buffer; iio_dev_opaque->attached_buffers_cnt = cnt; return 0; } EXPORT_SYMBOL_GPL(iio_device_attach_buffer); |
13 3 1 1 8 1 7 10 379 365 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/netlink.h> #include <linux/nospec.h> #include <linux/rtnetlink.h> #include <linux/types.h> #include <net/ip.h> #include <net/net_namespace.h> #include <net/tcp.h> static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, u32 *metrics, struct netlink_ext_ack *extack) { bool ecn_ca = false; struct nlattr *nla; int remaining; nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) { int type = nla_type(nla); u32 val; if (!type) continue; if (type > RTAX_MAX) { NL_SET_ERR_MSG(extack, "Invalid metric type"); return -EINVAL; } type = array_index_nospec(type, RTAX_MAX + 1); if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) { NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm"); return -EINVAL; } } else { if (nla_len(nla) != sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid attribute in metrics"); return -EINVAL; } val = nla_get_u32(nla); } if (type == RTAX_ADVMSS && val > 65535 - 40) val = 65535 - 40; if (type == RTAX_MTU && val > 65535 - 15) val = 65535 - 15; if (type == RTAX_HOPLIMIT && val > 255) val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) { NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute"); return -EINVAL; } metrics[type - 1] = val; } if (ecn_ca) metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; return 0; } struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, int fc_mx_len, struct netlink_ext_ack *extack) { struct dst_metrics *fib_metrics; int err; if (!fc_mx) return (struct dst_metrics *)&dst_default_metrics; fib_metrics = kzalloc(sizeof(*fib_metrics), GFP_KERNEL); if (unlikely(!fib_metrics)) return ERR_PTR(-ENOMEM); err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics, extack); if (!err) { refcount_set(&fib_metrics->refcnt, 1); } else { kfree(fib_metrics); fib_metrics = ERR_PTR(err); } return fib_metrics; } EXPORT_SYMBOL_GPL(ip_fib_metrics_init); |
2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 | /* * videobuf2-memops.c - generic memory handling routines for videobuf2 * * Copyright (C) 2010 Samsung Electronics * * Author: Pawel Osciak <pawel@osciak.com> * Marek Szyprowski <m.szyprowski@samsung.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation. */ #include <linux/slab.h> #include <linux/module.h> #include <linux/dma-mapping.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/sched.h> #include <linux/file.h> #include <media/videobuf2-v4l2.h> #include <media/videobuf2-memops.h> /** * vb2_create_framevec() - map virtual addresses to pfns * @start: Virtual user address where we start mapping * @length: Length of a range to map * @write: Should we map for writing into the area * * This function allocates and fills in a vector with pfns corresponding to * virtual address range passed in arguments. If pfns have corresponding pages, * page references are also grabbed to pin pages in memory. The function * returns pointer to the vector on success and error pointer in case of * failure. Returned vector needs to be freed via vb2_destroy_pfnvec(). */ struct frame_vector *vb2_create_framevec(unsigned long start, unsigned long length, bool write) { int ret; unsigned long first, last; unsigned long nr; struct frame_vector *vec; first = start >> PAGE_SHIFT; last = (start + length - 1) >> PAGE_SHIFT; nr = last - first + 1; vec = frame_vector_create(nr); if (!vec) return ERR_PTR(-ENOMEM); ret = get_vaddr_frames(start & PAGE_MASK, nr, write, vec); if (ret < 0) goto out_destroy; /* We accept only complete set of PFNs */ if (ret != nr) { ret = -EFAULT; goto out_release; } return vec; out_release: put_vaddr_frames(vec); out_destroy: frame_vector_destroy(vec); return ERR_PTR(ret); } EXPORT_SYMBOL(vb2_create_framevec); /** * vb2_destroy_framevec() - release vector of mapped pfns * @vec: vector of pfns / pages to release * * This releases references to all pages in the vector @vec (if corresponding * pfns are backed by pages) and frees the passed vector. */ void vb2_destroy_framevec(struct frame_vector *vec) { put_vaddr_frames(vec); frame_vector_destroy(vec); } EXPORT_SYMBOL(vb2_destroy_framevec); /** * vb2_common_vm_open() - increase refcount of the vma * @vma: virtual memory region for the mapping * * This function adds another user to the provided vma. It expects * struct vb2_vmarea_handler pointer in vma->vm_private_data. */ static void vb2_common_vm_open(struct vm_area_struct *vma) { struct vb2_vmarea_handler *h = vma->vm_private_data; pr_debug("%s: %p, refcount: %d, vma: %08lx-%08lx\n", __func__, h, refcount_read(h->refcount), vma->vm_start, vma->vm_end); refcount_inc(h->refcount); } /** * vb2_common_vm_close() - decrease refcount of the vma * @vma: virtual memory region for the mapping * * This function releases the user from the provided vma. It expects * struct vb2_vmarea_handler pointer in vma->vm_private_data. */ static void vb2_common_vm_close(struct vm_area_struct *vma) { struct vb2_vmarea_handler *h = vma->vm_private_data; pr_debug("%s: %p, refcount: %d, vma: %08lx-%08lx\n", __func__, h, refcount_read(h->refcount), vma->vm_start, vma->vm_end); h->put(h->arg); } /* * vb2_common_vm_ops - common vm_ops used for tracking refcount of mmapped * video buffers */ const struct vm_operations_struct vb2_common_vm_ops = { .open = vb2_common_vm_open, .close = vb2_common_vm_close, }; EXPORT_SYMBOL_GPL(vb2_common_vm_ops); MODULE_DESCRIPTION("common memory handling routines for videobuf2"); MODULE_AUTHOR("Pawel Osciak <pawel@osciak.com>"); MODULE_LICENSE("GPL"); |
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 | // SPDX-License-Identifier: GPL-2.0-or-later /* * cx2341x - generic code for cx23415/6/8 based devices * * Copyright (C) 2006 Hans Verkuil <hverkuil@xs4all.nl> */ #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/types.h> #include <linux/videodev2.h> #include <media/tuner.h> #include <media/drv-intf/cx2341x.h> #include <media/v4l2-common.h> MODULE_DESCRIPTION("cx23415/6/8 driver"); MODULE_AUTHOR("Hans Verkuil"); MODULE_LICENSE("GPL"); static int debug; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "Debug level (0-1)"); /********************** COMMON CODE *********************/ /* definitions for audio properties bits 29-28 */ #define CX2341X_AUDIO_ENCODING_METHOD_MPEG 0 #define CX2341X_AUDIO_ENCODING_METHOD_AC3 1 #define CX2341X_AUDIO_ENCODING_METHOD_LPCM 2 static const char *cx2341x_get_name(u32 id) { switch (id) { case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE: return "Spatial Filter Mode"; case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER: return "Spatial Filter"; case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE: return "Spatial Luma Filter Type"; case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE: return "Spatial Chroma Filter Type"; case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE: return "Temporal Filter Mode"; case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER: return "Temporal Filter"; case V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE: return "Median Filter Type"; case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_TOP: return "Median Luma Filter Maximum"; case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_BOTTOM: return "Median Luma Filter Minimum"; case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_TOP: return "Median Chroma Filter Maximum"; case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_BOTTOM: return "Median Chroma Filter Minimum"; case V4L2_CID_MPEG_CX2341X_STREAM_INSERT_NAV_PACKETS: return "Insert Navigation Packets"; } return NULL; } static const char **cx2341x_get_menu(u32 id) { static const char *cx2341x_video_spatial_filter_mode_menu[] = { "Manual", "Auto", NULL }; static const char *cx2341x_video_luma_spatial_filter_type_menu[] = { "Off", "1D Horizontal", "1D Vertical", "2D H/V Separable", "2D Symmetric non-separable", NULL }; static const char *cx2341x_video_chroma_spatial_filter_type_menu[] = { "Off", "1D Horizontal", NULL }; static const char *cx2341x_video_temporal_filter_mode_menu[] = { "Manual", "Auto", NULL }; static const char *cx2341x_video_median_filter_type_menu[] = { "Off", "Horizontal", "Vertical", "Horizontal/Vertical", "Diagonal", NULL }; switch (id) { case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE: return cx2341x_video_spatial_filter_mode_menu; case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE: return cx2341x_video_luma_spatial_filter_type_menu; case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE: return cx2341x_video_chroma_spatial_filter_type_menu; case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE: return cx2341x_video_temporal_filter_mode_menu; case V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE: return cx2341x_video_median_filter_type_menu; } return NULL; } static void cx2341x_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, s32 *min, s32 *max, s32 *step, s32 *def, u32 *flags) { *name = cx2341x_get_name(id); *flags = 0; switch (id) { case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE: case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE: case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE: case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE: case V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE: *type = V4L2_CTRL_TYPE_MENU; *min = 0; *step = 0; break; case V4L2_CID_MPEG_CX2341X_STREAM_INSERT_NAV_PACKETS: *type = V4L2_CTRL_TYPE_BOOLEAN; *min = 0; *max = *step = 1; break; default: *type = V4L2_CTRL_TYPE_INTEGER; break; } switch (id) { case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE: case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE: case V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE: *flags |= V4L2_CTRL_FLAG_UPDATE; break; case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER: case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER: case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_TOP: case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_BOTTOM: case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_TOP: case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_BOTTOM: *flags |= V4L2_CTRL_FLAG_SLIDER; break; case V4L2_CID_MPEG_VIDEO_ENCODING: *flags |= V4L2_CTRL_FLAG_READ_ONLY; break; } } /********************** OLD CODE *********************/ /* Must be sorted from low to high control ID! */ const u32 cx2341x_mpeg_ctrls[] = { V4L2_CID_CODEC_CLASS, V4L2_CID_MPEG_STREAM_TYPE, V4L2_CID_MPEG_STREAM_VBI_FMT, V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ, V4L2_CID_MPEG_AUDIO_ENCODING, V4L2_CID_MPEG_AUDIO_L2_BITRATE, V4L2_CID_MPEG_AUDIO_MODE, V4L2_CID_MPEG_AUDIO_MODE_EXTENSION, V4L2_CID_MPEG_AUDIO_EMPHASIS, V4L2_CID_MPEG_AUDIO_CRC, V4L2_CID_MPEG_AUDIO_MUTE, V4L2_CID_MPEG_AUDIO_AC3_BITRATE, V4L2_CID_MPEG_VIDEO_ENCODING, V4L2_CID_MPEG_VIDEO_ASPECT, V4L2_CID_MPEG_VIDEO_B_FRAMES, V4L2_CID_MPEG_VIDEO_GOP_SIZE, V4L2_CID_MPEG_VIDEO_GOP_CLOSURE, V4L2_CID_MPEG_VIDEO_BITRATE_MODE, V4L2_CID_MPEG_VIDEO_BITRATE, V4L2_CID_MPEG_VIDEO_BITRATE_PEAK, V4L2_CID_MPEG_VIDEO_TEMPORAL_DECIMATION, V4L2_CID_MPEG_VIDEO_MUTE, V4L2_CID_MPEG_VIDEO_MUTE_YUV, V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE, V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER, V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE, V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE, V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE, V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER, V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE, V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_BOTTOM, V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_TOP, V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_BOTTOM, V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_TOP, V4L2_CID_MPEG_CX2341X_STREAM_INSERT_NAV_PACKETS, 0 }; EXPORT_SYMBOL(cx2341x_mpeg_ctrls); static const struct cx2341x_mpeg_params default_params = { /* misc */ .capabilities = 0, .port = CX2341X_PORT_MEMORY, .width = 720, .height = 480, .is_50hz = 0, /* stream */ .stream_type = V4L2_MPEG_STREAM_TYPE_MPEG2_PS, .stream_vbi_fmt = V4L2_MPEG_STREAM_VBI_FMT_NONE, .stream_insert_nav_packets = 0, /* audio */ .audio_sampling_freq = V4L2_MPEG_AUDIO_SAMPLING_FREQ_48000, .audio_encoding = V4L2_MPEG_AUDIO_ENCODING_LAYER_2, .audio_l2_bitrate = V4L2_MPEG_AUDIO_L2_BITRATE_224K, .audio_ac3_bitrate = V4L2_MPEG_AUDIO_AC3_BITRATE_224K, .audio_mode = V4L2_MPEG_AUDIO_MODE_STEREO, .audio_mode_extension = V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_4, .audio_emphasis = V4L2_MPEG_AUDIO_EMPHASIS_NONE, .audio_crc = V4L2_MPEG_AUDIO_CRC_NONE, .audio_mute = 0, /* video */ .video_encoding = V4L2_MPEG_VIDEO_ENCODING_MPEG_2, .video_aspect = V4L2_MPEG_VIDEO_ASPECT_4x3, .video_b_frames = 2, .video_gop_size = 12, .video_gop_closure = 1, .video_bitrate_mode = V4L2_MPEG_VIDEO_BITRATE_MODE_VBR, .video_bitrate = 6000000, .video_bitrate_peak = 8000000, .video_temporal_decimation = 0, .video_mute = 0, .video_mute_yuv = 0x008080, /* YCbCr value for black */ /* encoding filters */ .video_spatial_filter_mode = V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_MANUAL, .video_spatial_filter = 0, .video_luma_spatial_filter_type = V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_1D_HOR, .video_chroma_spatial_filter_type = V4L2_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE_1D_HOR, .video_temporal_filter_mode = V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_MANUAL, .video_temporal_filter = 8, .video_median_filter_type = V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_OFF, .video_luma_median_filter_top = 255, .video_luma_median_filter_bottom = 0, .video_chroma_median_filter_top = 255, .video_chroma_median_filter_bottom = 0, }; /* Map the control ID to the correct field in the cx2341x_mpeg_params struct. Return -EINVAL if the ID is unknown, else return 0. */ static int cx2341x_get_ctrl(const struct cx2341x_mpeg_params *params, struct v4l2_ext_control *ctrl) { switch (ctrl->id) { case V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ: ctrl->value = params->audio_sampling_freq; break; case V4L2_CID_MPEG_AUDIO_ENCODING: ctrl->value = params->audio_encoding; break; case V4L2_CID_MPEG_AUDIO_L2_BITRATE: ctrl->value = params->audio_l2_bitrate; break; case V4L2_CID_MPEG_AUDIO_AC3_BITRATE: ctrl->value = params->audio_ac3_bitrate; break; case V4L2_CID_MPEG_AUDIO_MODE: ctrl->value = params->audio_mode; break; case V4L2_CID_MPEG_AUDIO_MODE_EXTENSION: ctrl->value = params->audio_mode_extension; break; case V4L2_CID_MPEG_AUDIO_EMPHASIS: ctrl->value = params->audio_emphasis; break; case V4L2_CID_MPEG_AUDIO_CRC: ctrl->value = params->audio_crc; break; case V4L2_CID_MPEG_AUDIO_MUTE: ctrl->value = params->audio_mute; break; case V4L2_CID_MPEG_VIDEO_ENCODING: ctrl->value = params->video_encoding; break; case V4L2_CID_MPEG_VIDEO_ASPECT: ctrl->value = params->video_aspect; break; case V4L2_CID_MPEG_VIDEO_B_FRAMES: ctrl->value = params->video_b_frames; break; case V4L2_CID_MPEG_VIDEO_GOP_SIZE: ctrl->value = params->video_gop_size; break; case V4L2_CID_MPEG_VIDEO_GOP_CLOSURE: ctrl->value = params->video_gop_closure; break; case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: ctrl->value = params->video_bitrate_mode; break; case V4L2_CID_MPEG_VIDEO_BITRATE: ctrl->value = params->video_bitrate; break; case V4L2_CID_MPEG_VIDEO_BITRATE_PEAK: ctrl->value = params->video_bitrate_peak; break; case V4L2_CID_MPEG_VIDEO_TEMPORAL_DECIMATION: ctrl->value = params->video_temporal_decimation; break; case V4L2_CID_MPEG_VIDEO_MUTE: ctrl->value = params->video_mute; break; case V4L2_CID_MPEG_VIDEO_MUTE_YUV: ctrl->value = params->video_mute_yuv; break; case V4L2_CID_MPEG_STREAM_TYPE: ctrl->value = params->stream_type; break; case V4L2_CID_MPEG_STREAM_VBI_FMT: ctrl->value = params->stream_vbi_fmt; break; case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE: ctrl->value = params->video_spatial_filter_mode; break; case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER: ctrl->value = params->video_spatial_filter; break; case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE: ctrl->value = params->video_luma_spatial_filter_type; break; case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE: ctrl->value = params->video_chroma_spatial_filter_type; break; case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE: ctrl->value = params->video_temporal_filter_mode; break; case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER: ctrl->value = params->video_temporal_filter; break; case V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE: ctrl->value = params->video_median_filter_type; break; case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_TOP: ctrl->value = params->video_luma_median_filter_top; break; case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_BOTTOM: ctrl->value = params->video_luma_median_filter_bottom; break; case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_TOP: ctrl->value = params->video_chroma_median_filter_top; break; case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_BOTTOM: ctrl->value = params->video_chroma_median_filter_bottom; break; case V4L2_CID_MPEG_CX2341X_STREAM_INSERT_NAV_PACKETS: ctrl->value = params->stream_insert_nav_packets; break; default: return -EINVAL; } return 0; } /* Map the control ID to the correct field in the cx2341x_mpeg_params struct. Return -EINVAL if the ID is unknown, else return 0. */ static int cx2341x_set_ctrl(struct cx2341x_mpeg_params *params, int busy, struct v4l2_ext_control *ctrl) { switch (ctrl->id) { case V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ: if (busy) return -EBUSY; params->audio_sampling_freq = ctrl->value; break; case V4L2_CID_MPEG_AUDIO_ENCODING: if (busy) return -EBUSY; if (params->capabilities & CX2341X_CAP_HAS_AC3) if (ctrl->value != V4L2_MPEG_AUDIO_ENCODING_LAYER_2 && ctrl->value != V4L2_MPEG_AUDIO_ENCODING_AC3) return -ERANGE; params->audio_encoding = ctrl->value; break; case V4L2_CID_MPEG_AUDIO_L2_BITRATE: if (busy) return -EBUSY; params->audio_l2_bitrate = ctrl->value; break; case V4L2_CID_MPEG_AUDIO_AC3_BITRATE: if (busy) return -EBUSY; if (!(params->capabilities & CX2341X_CAP_HAS_AC3)) return -EINVAL; params->audio_ac3_bitrate = ctrl->value; break; case V4L2_CID_MPEG_AUDIO_MODE: params->audio_mode = ctrl->value; break; case V4L2_CID_MPEG_AUDIO_MODE_EXTENSION: params->audio_mode_extension = ctrl->value; break; case V4L2_CID_MPEG_AUDIO_EMPHASIS: params->audio_emphasis = ctrl->value; break; case V4L2_CID_MPEG_AUDIO_CRC: params->audio_crc = ctrl->value; break; case V4L2_CID_MPEG_AUDIO_MUTE: params->audio_mute = ctrl->value; break; case V4L2_CID_MPEG_VIDEO_ASPECT: params->video_aspect = ctrl->value; break; case V4L2_CID_MPEG_VIDEO_B_FRAMES: { int b = ctrl->value + 1; int gop = params->video_gop_size; params->video_b_frames = ctrl->value; params->video_gop_size = b * ((gop + b - 1) / b); /* Max GOP size = 34 */ while (params->video_gop_size > 34) params->video_gop_size -= b; break; } case V4L2_CID_MPEG_VIDEO_GOP_SIZE: { int b = params->video_b_frames + 1; int gop = ctrl->value; params->video_gop_size = b * ((gop + b - 1) / b); /* Max GOP size = 34 */ while (params->video_gop_size > 34) params->video_gop_size -= b; ctrl->value = params->video_gop_size; break; } case V4L2_CID_MPEG_VIDEO_GOP_CLOSURE: params->video_gop_closure = ctrl->value; break; case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: if (busy) return -EBUSY; /* MPEG-1 only allows CBR */ if (params->video_encoding == V4L2_MPEG_VIDEO_ENCODING_MPEG_1 && ctrl->value != V4L2_MPEG_VIDEO_BITRATE_MODE_CBR) return -EINVAL; params->video_bitrate_mode = ctrl->value; break; case V4L2_CID_MPEG_VIDEO_BITRATE: if (busy) return -EBUSY; params->video_bitrate = ctrl->value; break; case V4L2_CID_MPEG_VIDEO_BITRATE_PEAK: if (busy) return -EBUSY; params->video_bitrate_peak = ctrl->value; break; case V4L2_CID_MPEG_VIDEO_TEMPORAL_DECIMATION: params->video_temporal_decimation = ctrl->value; break; case V4L2_CID_MPEG_VIDEO_MUTE: params->video_mute = (ctrl->value != 0); break; case V4L2_CID_MPEG_VIDEO_MUTE_YUV: params->video_mute_yuv = ctrl->value; break; case V4L2_CID_MPEG_STREAM_TYPE: if (busy) return -EBUSY; params->stream_type = ctrl->value; params->video_encoding = (params->stream_type == V4L2_MPEG_STREAM_TYPE_MPEG1_SS || params->stream_type == V4L2_MPEG_STREAM_TYPE_MPEG1_VCD) ? V4L2_MPEG_VIDEO_ENCODING_MPEG_1 : V4L2_MPEG_VIDEO_ENCODING_MPEG_2; if (params->video_encoding == V4L2_MPEG_VIDEO_ENCODING_MPEG_1) /* MPEG-1 implies CBR */ params->video_bitrate_mode = V4L2_MPEG_VIDEO_BITRATE_MODE_CBR; break; case V4L2_CID_MPEG_STREAM_VBI_FMT: params->stream_vbi_fmt = ctrl->value; break; case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE: params->video_spatial_filter_mode = ctrl->value; break; case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER: params->video_spatial_filter = ctrl->value; break; case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE: params->video_luma_spatial_filter_type = ctrl->value; break; case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE: params->video_chroma_spatial_filter_type = ctrl->value; break; case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE: params->video_temporal_filter_mode = ctrl->value; break; case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER: params->video_temporal_filter = ctrl->value; break; case V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE: params->video_median_filter_type = ctrl->value; break; case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_TOP: params->video_luma_median_filter_top = ctrl->value; break; case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_BOTTOM: params->video_luma_median_filter_bottom = ctrl->value; break; case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_TOP: params->video_chroma_median_filter_top = ctrl->value; break; case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_BOTTOM: params->video_chroma_median_filter_bottom = ctrl->value; break; case V4L2_CID_MPEG_CX2341X_STREAM_INSERT_NAV_PACKETS: params->stream_insert_nav_packets = ctrl->value; break; default: return -EINVAL; } return 0; } static int cx2341x_ctrl_query_fill(struct v4l2_queryctrl *qctrl, s32 min, s32 max, s32 step, s32 def) { const char *name; switch (qctrl->id) { /* MPEG controls */ case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE: case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER: case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE: case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE: case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE: case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER: case V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE: case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_TOP: case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_BOTTOM: case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_TOP: case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_BOTTOM: case V4L2_CID_MPEG_CX2341X_STREAM_INSERT_NAV_PACKETS: cx2341x_ctrl_fill(qctrl->id, &name, &qctrl->type, &min, &max, &step, &def, &qctrl->flags); qctrl->minimum = min; qctrl->maximum = max; qctrl->step = step; qctrl->default_value = def; qctrl->reserved[0] = qctrl->reserved[1] = 0; strscpy(qctrl->name, name, sizeof(qctrl->name)); return 0; default: return v4l2_ctrl_query_fill(qctrl, min, max, step, def); } } int cx2341x_ctrl_query(const struct cx2341x_mpeg_params *params, struct v4l2_queryctrl *qctrl) { int err; switch (qctrl->id) { case V4L2_CID_CODEC_CLASS: return v4l2_ctrl_query_fill(qctrl, 0, 0, 0, 0); case V4L2_CID_MPEG_STREAM_TYPE: return v4l2_ctrl_query_fill(qctrl, V4L2_MPEG_STREAM_TYPE_MPEG2_PS, V4L2_MPEG_STREAM_TYPE_MPEG2_SVCD, 1, V4L2_MPEG_STREAM_TYPE_MPEG2_PS); case V4L2_CID_MPEG_STREAM_VBI_FMT: if (params->capabilities & CX2341X_CAP_HAS_SLICED_VBI) return v4l2_ctrl_query_fill(qctrl, V4L2_MPEG_STREAM_VBI_FMT_NONE, V4L2_MPEG_STREAM_VBI_FMT_IVTV, 1, V4L2_MPEG_STREAM_VBI_FMT_NONE); return cx2341x_ctrl_query_fill(qctrl, V4L2_MPEG_STREAM_VBI_FMT_NONE, V4L2_MPEG_STREAM_VBI_FMT_NONE, 1, default_params.stream_vbi_fmt); case V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ: return v4l2_ctrl_query_fill(qctrl, V4L2_MPEG_AUDIO_SAMPLING_FREQ_44100, V4L2_MPEG_AUDIO_SAMPLING_FREQ_32000, 1, V4L2_MPEG_AUDIO_SAMPLING_FREQ_48000); case V4L2_CID_MPEG_AUDIO_ENCODING: if (params->capabilities & CX2341X_CAP_HAS_AC3) { /* * The state of L2 & AC3 bitrate controls can change * when this control changes, but v4l2_ctrl_query_fill() * already sets V4L2_CTRL_FLAG_UPDATE for * V4L2_CID_MPEG_AUDIO_ENCODING, so we don't here. */ return v4l2_ctrl_query_fill(qctrl, V4L2_MPEG_AUDIO_ENCODING_LAYER_2, V4L2_MPEG_AUDIO_ENCODING_AC3, 1, default_params.audio_encoding); } return v4l2_ctrl_query_fill(qctrl, V4L2_MPEG_AUDIO_ENCODING_LAYER_2, V4L2_MPEG_AUDIO_ENCODING_LAYER_2, 1, default_params.audio_encoding); case V4L2_CID_MPEG_AUDIO_L2_BITRATE: err = v4l2_ctrl_query_fill(qctrl, V4L2_MPEG_AUDIO_L2_BITRATE_192K, V4L2_MPEG_AUDIO_L2_BITRATE_384K, 1, default_params.audio_l2_bitrate); if (err) return err; if (params->capabilities & CX2341X_CAP_HAS_AC3 && params->audio_encoding != V4L2_MPEG_AUDIO_ENCODING_LAYER_2) qctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; return 0; case V4L2_CID_MPEG_AUDIO_MODE: return v4l2_ctrl_query_fill(qctrl, V4L2_MPEG_AUDIO_MODE_STEREO, V4L2_MPEG_AUDIO_MODE_MONO, 1, V4L2_MPEG_AUDIO_MODE_STEREO); case V4L2_CID_MPEG_AUDIO_MODE_EXTENSION: err = v4l2_ctrl_query_fill(qctrl, V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_4, V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_16, 1, V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_4); if (err == 0 && params->audio_mode != V4L2_MPEG_AUDIO_MODE_JOINT_STEREO) qctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; return err; case V4L2_CID_MPEG_AUDIO_EMPHASIS: return v4l2_ctrl_query_fill(qctrl, V4L2_MPEG_AUDIO_EMPHASIS_NONE, V4L2_MPEG_AUDIO_EMPHASIS_CCITT_J17, 1, V4L2_MPEG_AUDIO_EMPHASIS_NONE); case V4L2_CID_MPEG_AUDIO_CRC: return v4l2_ctrl_query_fill(qctrl, V4L2_MPEG_AUDIO_CRC_NONE, V4L2_MPEG_AUDIO_CRC_CRC16, 1, V4L2_MPEG_AUDIO_CRC_NONE); case V4L2_CID_MPEG_AUDIO_MUTE: return v4l2_ctrl_query_fill(qctrl, 0, 1, 1, 0); case V4L2_CID_MPEG_AUDIO_AC3_BITRATE: err = v4l2_ctrl_query_fill(qctrl, V4L2_MPEG_AUDIO_AC3_BITRATE_48K, V4L2_MPEG_AUDIO_AC3_BITRATE_448K, 1, default_params.audio_ac3_bitrate); if (err) return err; if (params->capabilities & CX2341X_CAP_HAS_AC3) { if (params->audio_encoding != V4L2_MPEG_AUDIO_ENCODING_AC3) qctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; } else qctrl->flags |= V4L2_CTRL_FLAG_DISABLED; return 0; case V4L2_CID_MPEG_VIDEO_ENCODING: /* this setting is read-only for the cx2341x since the V4L2_CID_MPEG_STREAM_TYPE really determines the MPEG-1/2 setting */ err = v4l2_ctrl_query_fill(qctrl, V4L2_MPEG_VIDEO_ENCODING_MPEG_1, V4L2_MPEG_VIDEO_ENCODING_MPEG_2, 1, V4L2_MPEG_VIDEO_ENCODING_MPEG_2); if (err == 0) qctrl->flags |= V4L2_CTRL_FLAG_READ_ONLY; return err; case V4L2_CID_MPEG_VIDEO_ASPECT: return v4l2_ctrl_query_fill(qctrl, V4L2_MPEG_VIDEO_ASPECT_1x1, V4L2_MPEG_VIDEO_ASPECT_221x100, 1, V4L2_MPEG_VIDEO_ASPECT_4x3); case V4L2_CID_MPEG_VIDEO_B_FRAMES: return v4l2_ctrl_query_fill(qctrl, 0, 33, 1, 2); case V4L2_CID_MPEG_VIDEO_GOP_SIZE: return v4l2_ctrl_query_fill(qctrl, 1, 34, 1, params->is_50hz ? 12 : 15); case V4L2_CID_MPEG_VIDEO_GOP_CLOSURE: return v4l2_ctrl_query_fill(qctrl, 0, 1, 1, 1); case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: err = v4l2_ctrl_query_fill(qctrl, V4L2_MPEG_VIDEO_BITRATE_MODE_VBR, V4L2_MPEG_VIDEO_BITRATE_MODE_CBR, 1, V4L2_MPEG_VIDEO_BITRATE_MODE_VBR); if (err == 0 && params->video_encoding == V4L2_MPEG_VIDEO_ENCODING_MPEG_1) qctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; return err; case V4L2_CID_MPEG_VIDEO_BITRATE: return v4l2_ctrl_query_fill(qctrl, 0, 27000000, 1, 6000000); case V4L2_CID_MPEG_VIDEO_BITRATE_PEAK: err = v4l2_ctrl_query_fill(qctrl, 0, 27000000, 1, 8000000); if (err == 0 && params->video_bitrate_mode == V4L2_MPEG_VIDEO_BITRATE_MODE_CBR) qctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; return err; case V4L2_CID_MPEG_VIDEO_TEMPORAL_DECIMATION: return v4l2_ctrl_query_fill(qctrl, 0, 255, 1, 0); case V4L2_CID_MPEG_VIDEO_MUTE: return v4l2_ctrl_query_fill(qctrl, 0, 1, 1, 0); case V4L2_CID_MPEG_VIDEO_MUTE_YUV: /* Init YUV (really YCbCr) to black */ return v4l2_ctrl_query_fill(qctrl, 0, 0xffffff, 1, 0x008080); /* CX23415/6 specific */ case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE: return cx2341x_ctrl_query_fill(qctrl, V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_MANUAL, V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_AUTO, 1, default_params.video_spatial_filter_mode); case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER: cx2341x_ctrl_query_fill(qctrl, 0, 15, 1, default_params.video_spatial_filter); qctrl->flags |= V4L2_CTRL_FLAG_SLIDER; if (params->video_spatial_filter_mode == V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_AUTO) qctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; return 0; case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE: cx2341x_ctrl_query_fill(qctrl, V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_OFF, V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_2D_SYM_NON_SEPARABLE, 1, default_params.video_luma_spatial_filter_type); if (params->video_spatial_filter_mode == V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_AUTO) qctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; return 0; case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE: cx2341x_ctrl_query_fill(qctrl, V4L2_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE_OFF, V4L2_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE_1D_HOR, 1, default_params.video_chroma_spatial_filter_type); if (params->video_spatial_filter_mode == V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_AUTO) qctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; return 0; case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE: return cx2341x_ctrl_query_fill(qctrl, V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_MANUAL, V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_AUTO, 1, default_params.video_temporal_filter_mode); case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER: cx2341x_ctrl_query_fill(qctrl, 0, 31, 1, default_params.video_temporal_filter); qctrl->flags |= V4L2_CTRL_FLAG_SLIDER; if (params->video_temporal_filter_mode == V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_AUTO) qctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; return 0; case V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE: return cx2341x_ctrl_query_fill(qctrl, V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_OFF, V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_DIAG, 1, default_params.video_median_filter_type); case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_TOP: cx2341x_ctrl_query_fill(qctrl, 0, 255, 1, default_params.video_luma_median_filter_top); qctrl->flags |= V4L2_CTRL_FLAG_SLIDER; if (params->video_median_filter_type == V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_OFF) qctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; return 0; case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_BOTTOM: cx2341x_ctrl_query_fill(qctrl, 0, 255, 1, default_params.video_luma_median_filter_bottom); qctrl->flags |= V4L2_CTRL_FLAG_SLIDER; if (params->video_median_filter_type == V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_OFF) qctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; return 0; case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_TOP: cx2341x_ctrl_query_fill(qctrl, 0, 255, 1, default_params.video_chroma_median_filter_top); qctrl->flags |= V4L2_CTRL_FLAG_SLIDER; if (params->video_median_filter_type == V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_OFF) qctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; return 0; case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_BOTTOM: cx2341x_ctrl_query_fill(qctrl, 0, 255, 1, default_params.video_chroma_median_filter_bottom); qctrl->flags |= V4L2_CTRL_FLAG_SLIDER; if (params->video_median_filter_type == V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_OFF) qctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; return 0; case V4L2_CID_MPEG_CX2341X_STREAM_INSERT_NAV_PACKETS: return cx2341x_ctrl_query_fill(qctrl, 0, 1, 1, default_params.stream_insert_nav_packets); default: return -EINVAL; } } EXPORT_SYMBOL(cx2341x_ctrl_query); const char * const *cx2341x_ctrl_get_menu(const struct cx2341x_mpeg_params *p, u32 id) { static const char * const mpeg_stream_type_without_ts[] = { "MPEG-2 Program Stream", "", "MPEG-1 System Stream", "MPEG-2 DVD-compatible Stream", "MPEG-1 VCD-compatible Stream", "MPEG-2 SVCD-compatible Stream", NULL }; static const char *mpeg_stream_type_with_ts[] = { "MPEG-2 Program Stream", "MPEG-2 Transport Stream", "MPEG-1 System Stream", "MPEG-2 DVD-compatible Stream", "MPEG-1 VCD-compatible Stream", "MPEG-2 SVCD-compatible Stream", NULL }; static const char *mpeg_audio_encoding_l2_ac3[] = { "", "MPEG-1/2 Layer II", "", "", "AC-3", NULL }; switch (id) { case V4L2_CID_MPEG_STREAM_TYPE: return (p->capabilities & CX2341X_CAP_HAS_TS) ? mpeg_stream_type_with_ts : mpeg_stream_type_without_ts; case V4L2_CID_MPEG_AUDIO_ENCODING: return (p->capabilities & CX2341X_CAP_HAS_AC3) ? mpeg_audio_encoding_l2_ac3 : v4l2_ctrl_get_menu(id); case V4L2_CID_MPEG_AUDIO_L1_BITRATE: case V4L2_CID_MPEG_AUDIO_L3_BITRATE: return NULL; case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE: case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE: case V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE: case V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE: case V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE: return cx2341x_get_menu(id); default: return v4l2_ctrl_get_menu(id); } } EXPORT_SYMBOL(cx2341x_ctrl_get_menu); static void cx2341x_calc_audio_properties(struct cx2341x_mpeg_params *params) { params->audio_properties = (params->audio_sampling_freq << 0) | (params->audio_mode << 8) | (params->audio_mode_extension << 10) | (((params->audio_emphasis == V4L2_MPEG_AUDIO_EMPHASIS_CCITT_J17) ? 3 : params->audio_emphasis) << 12) | (params->audio_crc << 14); if ((params->capabilities & CX2341X_CAP_HAS_AC3) && params->audio_encoding == V4L2_MPEG_AUDIO_ENCODING_AC3) { params->audio_properties |= /* Not sure if this MPEG Layer II setting is required */ ((3 - V4L2_MPEG_AUDIO_ENCODING_LAYER_2) << 2) | (params->audio_ac3_bitrate << 4) | (CX2341X_AUDIO_ENCODING_METHOD_AC3 << 28); } else { /* Assuming MPEG Layer II */ params->audio_properties |= ((3 - params->audio_encoding) << 2) | ((1 + params->audio_l2_bitrate) << 4); } } /* Check for correctness of the ctrl's value based on the data from struct v4l2_queryctrl and the available menu items. Note that menu_items may be NULL, in that case it is ignored. */ static int v4l2_ctrl_check(struct v4l2_ext_control *ctrl, struct v4l2_queryctrl *qctrl, const char * const *menu_items) { if (qctrl->flags & V4L2_CTRL_FLAG_DISABLED) return -EINVAL; if (qctrl->flags & V4L2_CTRL_FLAG_GRABBED) return -EBUSY; if (qctrl->type == V4L2_CTRL_TYPE_STRING) return 0; if (qctrl->type == V4L2_CTRL_TYPE_BUTTON || qctrl->type == V4L2_CTRL_TYPE_INTEGER64 || qctrl->type == V4L2_CTRL_TYPE_CTRL_CLASS) return 0; if (ctrl->value < qctrl->minimum || ctrl->value > qctrl->maximum) return -ERANGE; if (qctrl->type == V4L2_CTRL_TYPE_MENU && menu_items != NULL) { if (menu_items[ctrl->value] == NULL || menu_items[ctrl->value][0] == '\0') return -EINVAL; } if (qctrl->type == V4L2_CTRL_TYPE_BITMASK && (ctrl->value & ~qctrl->maximum)) return -ERANGE; return 0; } int cx2341x_ext_ctrls(struct cx2341x_mpeg_params *params, int busy, struct v4l2_ext_controls *ctrls, unsigned int cmd) { int err = 0; int i; if (cmd == VIDIOC_G_EXT_CTRLS) { for (i = 0; i < ctrls->count; i++) { struct v4l2_ext_control *ctrl = ctrls->controls + i; err = cx2341x_get_ctrl(params, ctrl); if (err) { ctrls->error_idx = i; break; } } return err; } for (i = 0; i < ctrls->count; i++) { struct v4l2_ext_control *ctrl = ctrls->controls + i; struct v4l2_queryctrl qctrl; const char * const *menu_items = NULL; qctrl.id = ctrl->id; err = cx2341x_ctrl_query(params, &qctrl); if (err) break; if (qctrl.type == V4L2_CTRL_TYPE_MENU) menu_items = cx2341x_ctrl_get_menu(params, qctrl.id); err = v4l2_ctrl_check(ctrl, &qctrl, menu_items); if (err) break; err = cx2341x_set_ctrl(params, busy, ctrl); if (err) break; } if (err == 0 && params->video_bitrate_mode == V4L2_MPEG_VIDEO_BITRATE_MODE_VBR && params->video_bitrate_peak < params->video_bitrate) { err = -ERANGE; ctrls->error_idx = ctrls->count; } if (err) ctrls->error_idx = i; else cx2341x_calc_audio_properties(params); return err; } EXPORT_SYMBOL(cx2341x_ext_ctrls); void cx2341x_fill_defaults(struct cx2341x_mpeg_params *p) { *p = default_params; cx2341x_calc_audio_properties(p); } EXPORT_SYMBOL(cx2341x_fill_defaults); static int cx2341x_api(void *priv, cx2341x_mbox_func func, u32 cmd, int args, ...) { u32 data[CX2341X_MBOX_MAX_DATA]; va_list vargs; int i; va_start(vargs, args); for (i = 0; i < args; i++) data[i] = va_arg(vargs, int); va_end(vargs); return func(priv, cmd, args, 0, data); } #define CMP_FIELD(__old, __new, __field) (__old->__field != __new->__field) int cx2341x_update(void *priv, cx2341x_mbox_func func, const struct cx2341x_mpeg_params *old, const struct cx2341x_mpeg_params *new) { static int mpeg_stream_type[] = { 0, /* MPEG-2 PS */ 1, /* MPEG-2 TS */ 2, /* MPEG-1 SS */ 14, /* DVD */ 11, /* VCD */ 12, /* SVCD */ }; int err; cx2341x_api(priv, func, CX2341X_ENC_SET_OUTPUT_PORT, 2, new->port, 0); if (!old || CMP_FIELD(old, new, is_50hz)) { err = cx2341x_api(priv, func, CX2341X_ENC_SET_FRAME_RATE, 1, new->is_50hz); if (err) return err; } if (!old || CMP_FIELD(old, new, width) || CMP_FIELD(old, new, height) || CMP_FIELD(old, new, video_encoding)) { u16 w = new->width; u16 h = new->height; if (new->video_encoding == V4L2_MPEG_VIDEO_ENCODING_MPEG_1) { w /= 2; h /= 2; } err = cx2341x_api(priv, func, CX2341X_ENC_SET_FRAME_SIZE, 2, h, w); if (err) return err; } if (!old || CMP_FIELD(old, new, stream_type)) { err = cx2341x_api(priv, func, CX2341X_ENC_SET_STREAM_TYPE, 1, mpeg_stream_type[new->stream_type]); if (err) return err; } if (!old || CMP_FIELD(old, new, video_aspect)) { err = cx2341x_api(priv, func, CX2341X_ENC_SET_ASPECT_RATIO, 1, 1 + new->video_aspect); if (err) return err; } if (!old || CMP_FIELD(old, new, video_b_frames) || CMP_FIELD(old, new, video_gop_size)) { err = cx2341x_api(priv, func, CX2341X_ENC_SET_GOP_PROPERTIES, 2, new->video_gop_size, new->video_b_frames + 1); if (err) return err; } if (!old || CMP_FIELD(old, new, video_gop_closure)) { err = cx2341x_api(priv, func, CX2341X_ENC_SET_GOP_CLOSURE, 1, new->video_gop_closure); if (err) return err; } if (!old || CMP_FIELD(old, new, audio_properties)) { err = cx2341x_api(priv, func, CX2341X_ENC_SET_AUDIO_PROPERTIES, 1, new->audio_properties); if (err) return err; } if (!old || CMP_FIELD(old, new, audio_mute)) { err = cx2341x_api(priv, func, CX2341X_ENC_MUTE_AUDIO, 1, new->audio_mute); if (err) return err; } if (!old || CMP_FIELD(old, new, video_bitrate_mode) || CMP_FIELD(old, new, video_bitrate) || CMP_FIELD(old, new, video_bitrate_peak)) { err = cx2341x_api(priv, func, CX2341X_ENC_SET_BIT_RATE, 5, new->video_bitrate_mode, new->video_bitrate, new->video_bitrate_peak / 400, 0, 0); if (err) return err; } if (!old || CMP_FIELD(old, new, video_spatial_filter_mode) || CMP_FIELD(old, new, video_temporal_filter_mode) || CMP_FIELD(old, new, video_median_filter_type)) { err = cx2341x_api(priv, func, CX2341X_ENC_SET_DNR_FILTER_MODE, 2, new->video_spatial_filter_mode | (new->video_temporal_filter_mode << 1), new->video_median_filter_type); if (err) return err; } if (!old || CMP_FIELD(old, new, video_luma_median_filter_bottom) || CMP_FIELD(old, new, video_luma_median_filter_top) || CMP_FIELD(old, new, video_chroma_median_filter_bottom) || CMP_FIELD(old, new, video_chroma_median_filter_top)) { err = cx2341x_api(priv, func, CX2341X_ENC_SET_CORING_LEVELS, 4, new->video_luma_median_filter_bottom, new->video_luma_median_filter_top, new->video_chroma_median_filter_bottom, new->video_chroma_median_filter_top); if (err) return err; } if (!old || CMP_FIELD(old, new, video_luma_spatial_filter_type) || CMP_FIELD(old, new, video_chroma_spatial_filter_type)) { err = cx2341x_api(priv, func, CX2341X_ENC_SET_SPATIAL_FILTER_TYPE, 2, new->video_luma_spatial_filter_type, new->video_chroma_spatial_filter_type); if (err) return err; } if (!old || CMP_FIELD(old, new, video_spatial_filter) || CMP_FIELD(old, new, video_temporal_filter)) { err = cx2341x_api(priv, func, CX2341X_ENC_SET_DNR_FILTER_PROPS, 2, new->video_spatial_filter, new->video_temporal_filter); if (err) return err; } if (!old || CMP_FIELD(old, new, video_temporal_decimation)) { err = cx2341x_api(priv, func, CX2341X_ENC_SET_FRAME_DROP_RATE, 1, new->video_temporal_decimation); if (err) return err; } if (!old || CMP_FIELD(old, new, video_mute) || (new->video_mute && CMP_FIELD(old, new, video_mute_yuv))) { err = cx2341x_api(priv, func, CX2341X_ENC_MUTE_VIDEO, 1, new->video_mute | (new->video_mute_yuv << 8)); if (err) return err; } if (!old || CMP_FIELD(old, new, stream_insert_nav_packets)) { err = cx2341x_api(priv, func, CX2341X_ENC_MISC, 2, 7, new->stream_insert_nav_packets); if (err) return err; } return 0; } EXPORT_SYMBOL(cx2341x_update); static const char *cx2341x_menu_item(const struct cx2341x_mpeg_params *p, u32 id) { const char * const *menu = cx2341x_ctrl_get_menu(p, id); struct v4l2_ext_control ctrl; if (menu == NULL) goto invalid; ctrl.id = id; if (cx2341x_get_ctrl(p, &ctrl)) goto invalid; while (ctrl.value-- && *menu) menu++; if (*menu == NULL) goto invalid; return *menu; invalid: return "<invalid>"; } void cx2341x_log_status(const struct cx2341x_mpeg_params *p, const char *prefix) { int is_mpeg1 = p->video_encoding == V4L2_MPEG_VIDEO_ENCODING_MPEG_1; /* Stream */ printk(KERN_INFO "%s: Stream: %s", prefix, cx2341x_menu_item(p, V4L2_CID_MPEG_STREAM_TYPE)); if (p->stream_insert_nav_packets) printk(KERN_CONT " (with navigation packets)"); printk(KERN_CONT "\n"); printk(KERN_INFO "%s: VBI Format: %s\n", prefix, cx2341x_menu_item(p, V4L2_CID_MPEG_STREAM_VBI_FMT)); /* Video */ printk(KERN_INFO "%s: Video: %dx%d, %d fps%s\n", prefix, p->width / (is_mpeg1 ? 2 : 1), p->height / (is_mpeg1 ? 2 : 1), p->is_50hz ? 25 : 30, (p->video_mute) ? " (muted)" : ""); printk(KERN_INFO "%s: Video: %s, %s, %s, %d", prefix, cx2341x_menu_item(p, V4L2_CID_MPEG_VIDEO_ENCODING), cx2341x_menu_item(p, V4L2_CID_MPEG_VIDEO_ASPECT), cx2341x_menu_item(p, V4L2_CID_MPEG_VIDEO_BITRATE_MODE), p->video_bitrate); if (p->video_bitrate_mode == V4L2_MPEG_VIDEO_BITRATE_MODE_VBR) printk(KERN_CONT ", Peak %d", p->video_bitrate_peak); printk(KERN_CONT "\n"); printk(KERN_INFO "%s: Video: GOP Size %d, %d B-Frames, %sGOP Closure\n", prefix, p->video_gop_size, p->video_b_frames, p->video_gop_closure ? "" : "No "); if (p->video_temporal_decimation) printk(KERN_INFO "%s: Video: Temporal Decimation %d\n", prefix, p->video_temporal_decimation); /* Audio */ printk(KERN_INFO "%s: Audio: %s, %s, %s, %s%s", prefix, cx2341x_menu_item(p, V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ), cx2341x_menu_item(p, V4L2_CID_MPEG_AUDIO_ENCODING), cx2341x_menu_item(p, p->audio_encoding == V4L2_MPEG_AUDIO_ENCODING_AC3 ? V4L2_CID_MPEG_AUDIO_AC3_BITRATE : V4L2_CID_MPEG_AUDIO_L2_BITRATE), cx2341x_menu_item(p, V4L2_CID_MPEG_AUDIO_MODE), p->audio_mute ? " (muted)" : ""); if (p->audio_mode == V4L2_MPEG_AUDIO_MODE_JOINT_STEREO) printk(KERN_CONT ", %s", cx2341x_menu_item(p, V4L2_CID_MPEG_AUDIO_MODE_EXTENSION)); printk(KERN_CONT ", %s, %s\n", cx2341x_menu_item(p, V4L2_CID_MPEG_AUDIO_EMPHASIS), cx2341x_menu_item(p, V4L2_CID_MPEG_AUDIO_CRC)); /* Encoding filters */ printk(KERN_INFO "%s: Spatial Filter: %s, Luma %s, Chroma %s, %d\n", prefix, cx2341x_menu_item(p, V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE), cx2341x_menu_item(p, V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE), cx2341x_menu_item(p, V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE), p->video_spatial_filter); printk(KERN_INFO "%s: Temporal Filter: %s, %d\n", prefix, cx2341x_menu_item(p, V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE), p->video_temporal_filter); printk(KERN_INFO "%s: Median Filter: %s, Luma [%d, %d], Chroma [%d, %d]\n", prefix, cx2341x_menu_item(p, V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE), p->video_luma_median_filter_bottom, p->video_luma_median_filter_top, p->video_chroma_median_filter_bottom, p->video_chroma_median_filter_top); } EXPORT_SYMBOL(cx2341x_log_status); /********************** NEW CODE *********************/ static inline struct cx2341x_handler *to_cxhdl(struct v4l2_ctrl *ctrl) { return container_of(ctrl->handler, struct cx2341x_handler, hdl); } static int cx2341x_hdl_api(struct cx2341x_handler *hdl, u32 cmd, int args, ...) { u32 data[CX2341X_MBOX_MAX_DATA]; va_list vargs; int i; va_start(vargs, args); for (i = 0; i < args; i++) data[i] = va_arg(vargs, int); va_end(vargs); return hdl->func(hdl->priv, cmd, args, 0, data); } /* ctrl->handler->lock is held, so it is safe to access cur.val */ static inline int cx2341x_neq(struct v4l2_ctrl *ctrl) { return ctrl && ctrl->val != ctrl->cur.val; } static int cx2341x_try_ctrl(struct v4l2_ctrl *ctrl) { struct cx2341x_handler *hdl = to_cxhdl(ctrl); s32 val = ctrl->val; switch (ctrl->id) { case V4L2_CID_MPEG_VIDEO_B_FRAMES: { /* video gop cluster */ int b = val + 1; int gop = hdl->video_gop_size->val; gop = b * ((gop + b - 1) / b); /* Max GOP size = 34 */ while (gop > 34) gop -= b; hdl->video_gop_size->val = gop; break; } case V4L2_CID_MPEG_STREAM_TYPE: /* stream type cluster */ hdl->video_encoding->val = (hdl->stream_type->val == V4L2_MPEG_STREAM_TYPE_MPEG1_SS || hdl->stream_type->val == V4L2_MPEG_STREAM_TYPE_MPEG1_VCD) ? V4L2_MPEG_VIDEO_ENCODING_MPEG_1 : V4L2_MPEG_VIDEO_ENCODING_MPEG_2; if (hdl->video_encoding->val == V4L2_MPEG_VIDEO_ENCODING_MPEG_1) /* MPEG-1 implies CBR */ hdl->video_bitrate_mode->val = V4L2_MPEG_VIDEO_BITRATE_MODE_CBR; /* peak bitrate shall be >= normal bitrate */ if (hdl->video_bitrate_mode->val == V4L2_MPEG_VIDEO_BITRATE_MODE_VBR && hdl->video_bitrate_peak->val < hdl->video_bitrate->val) hdl->video_bitrate_peak->val = hdl->video_bitrate->val; break; } return 0; } static int cx2341x_s_ctrl(struct v4l2_ctrl *ctrl) { static const int mpeg_stream_type[] = { 0, /* MPEG-2 PS */ 1, /* MPEG-2 TS */ 2, /* MPEG-1 SS */ 14, /* DVD */ 11, /* VCD */ 12, /* SVCD */ }; struct cx2341x_handler *hdl = to_cxhdl(ctrl); s32 val = ctrl->val; u32 props; int err; switch (ctrl->id) { case V4L2_CID_MPEG_STREAM_VBI_FMT: if (hdl->ops && hdl->ops->s_stream_vbi_fmt) return hdl->ops->s_stream_vbi_fmt(hdl, val); return 0; case V4L2_CID_MPEG_VIDEO_ASPECT: return cx2341x_hdl_api(hdl, CX2341X_ENC_SET_ASPECT_RATIO, 1, val + 1); case V4L2_CID_MPEG_VIDEO_GOP_CLOSURE: return cx2341x_hdl_api(hdl, CX2341X_ENC_SET_GOP_CLOSURE, 1, val); case V4L2_CID_MPEG_AUDIO_MUTE: return cx2341x_hdl_api(hdl, CX2341X_ENC_MUTE_AUDIO, 1, val); case V4L2_CID_MPEG_VIDEO_TEMPORAL_DECIMATION: return cx2341x_hdl_api(hdl, CX2341X_ENC_SET_FRAME_DROP_RATE, 1, val); case V4L2_CID_MPEG_CX2341X_STREAM_INSERT_NAV_PACKETS: return cx2341x_hdl_api(hdl, CX2341X_ENC_MISC, 2, 7, val); case V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ: /* audio properties cluster */ props = (hdl->audio_sampling_freq->val << 0) | (hdl->audio_mode->val << 8) | (hdl->audio_mode_extension->val << 10) | (hdl->audio_crc->val << 14); if (hdl->audio_emphasis->val == V4L2_MPEG_AUDIO_EMPHASIS_CCITT_J17) props |= 3 << 12; else props |= hdl->audio_emphasis->val << 12; if (hdl->audio_encoding->val == V4L2_MPEG_AUDIO_ENCODING_AC3) { props |= #if 1 /* Not sure if this MPEG Layer II setting is required */ ((3 - V4L2_MPEG_AUDIO_ENCODING_LAYER_2) << 2) | #endif (hdl->audio_ac3_bitrate->val << 4) | (CX2341X_AUDIO_ENCODING_METHOD_AC3 << 28); } else { /* Assuming MPEG Layer II */ props |= ((3 - hdl->audio_encoding->val) << 2) | ((1 + hdl->audio_l2_bitrate->val) << 4); } err = cx2341x_hdl_api(hdl, CX2341X_ENC_SET_AUDIO_PROPERTIES, 1, props); if (err) return err; hdl->audio_properties = props; if (hdl->audio_ac3_bitrate) { int is_ac3 = hdl->audio_encoding->val == V4L2_MPEG_AUDIO_ENCODING_AC3; v4l2_ctrl_activate(hdl->audio_ac3_bitrate, is_ac3); v4l2_ctrl_activate(hdl->audio_l2_bitrate, !is_ac3); } v4l2_ctrl_activate(hdl->audio_mode_extension, hdl->audio_mode->val == V4L2_MPEG_AUDIO_MODE_JOINT_STEREO); if (cx2341x_neq(hdl->audio_sampling_freq) && hdl->ops && hdl->ops->s_audio_sampling_freq) return hdl->ops->s_audio_sampling_freq(hdl, hdl->audio_sampling_freq->val); if (cx2341x_neq(hdl->audio_mode) && hdl->ops && hdl->ops->s_audio_mode) return hdl->ops->s_audio_mode(hdl, hdl->audio_mode->val); return 0; case V4L2_CID_MPEG_VIDEO_B_FRAMES: /* video gop cluster */ return cx2341x_hdl_api(hdl, CX2341X_ENC_SET_GOP_PROPERTIES, 2, hdl->video_gop_size->val, hdl->video_b_frames->val + 1); case V4L2_CID_MPEG_STREAM_TYPE: /* stream type cluster */ err = cx2341x_hdl_api(hdl, CX2341X_ENC_SET_STREAM_TYPE, 1, mpeg_stream_type[val]); if (err) return err; err = cx2341x_hdl_api(hdl, CX2341X_ENC_SET_BIT_RATE, 5, hdl->video_bitrate_mode->val, hdl->video_bitrate->val, hdl->video_bitrate_peak->val / 400, 0, 0); if (err) return err; v4l2_ctrl_activate(hdl->video_bitrate_mode, hdl->video_encoding->val != V4L2_MPEG_VIDEO_ENCODING_MPEG_1); v4l2_ctrl_activate(hdl->video_bitrate_peak, hdl->video_bitrate_mode->val != V4L2_MPEG_VIDEO_BITRATE_MODE_CBR); if (cx2341x_neq(hdl->video_encoding) && hdl->ops && hdl->ops->s_video_encoding) return hdl->ops->s_video_encoding(hdl, hdl->video_encoding->val); return 0; case V4L2_CID_MPEG_VIDEO_MUTE: /* video mute cluster */ return cx2341x_hdl_api(hdl, CX2341X_ENC_MUTE_VIDEO, 1, hdl->video_mute->val | (hdl->video_mute_yuv->val << 8)); case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE: { int active_filter; /* video filter mode */ err = cx2341x_hdl_api(hdl, CX2341X_ENC_SET_DNR_FILTER_MODE, 2, hdl->video_spatial_filter_mode->val | (hdl->video_temporal_filter_mode->val << 1), hdl->video_median_filter_type->val); if (err) return err; active_filter = hdl->video_spatial_filter_mode->val != V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_AUTO; v4l2_ctrl_activate(hdl->video_spatial_filter, active_filter); v4l2_ctrl_activate(hdl->video_luma_spatial_filter_type, active_filter); v4l2_ctrl_activate(hdl->video_chroma_spatial_filter_type, active_filter); active_filter = hdl->video_temporal_filter_mode->val != V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_AUTO; v4l2_ctrl_activate(hdl->video_temporal_filter, active_filter); active_filter = hdl->video_median_filter_type->val != V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_OFF; v4l2_ctrl_activate(hdl->video_luma_median_filter_bottom, active_filter); v4l2_ctrl_activate(hdl->video_luma_median_filter_top, active_filter); v4l2_ctrl_activate(hdl->video_chroma_median_filter_bottom, active_filter); v4l2_ctrl_activate(hdl->video_chroma_median_filter_top, active_filter); return 0; } case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE: /* video filter type cluster */ return cx2341x_hdl_api(hdl, CX2341X_ENC_SET_SPATIAL_FILTER_TYPE, 2, hdl->video_luma_spatial_filter_type->val, hdl->video_chroma_spatial_filter_type->val); case V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER: /* video filter cluster */ return cx2341x_hdl_api(hdl, CX2341X_ENC_SET_DNR_FILTER_PROPS, 2, hdl->video_spatial_filter->val, hdl->video_temporal_filter->val); case V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_TOP: /* video median cluster */ return cx2341x_hdl_api(hdl, CX2341X_ENC_SET_CORING_LEVELS, 4, hdl->video_luma_median_filter_bottom->val, hdl->video_luma_median_filter_top->val, hdl->video_chroma_median_filter_bottom->val, hdl->video_chroma_median_filter_top->val); } return -EINVAL; } static const struct v4l2_ctrl_ops cx2341x_ops = { .try_ctrl = cx2341x_try_ctrl, .s_ctrl = cx2341x_s_ctrl, }; static struct v4l2_ctrl *cx2341x_ctrl_new_custom(struct v4l2_ctrl_handler *hdl, u32 id, s32 min, s32 max, s32 step, s32 def) { struct v4l2_ctrl_config cfg; memset(&cfg, 0, sizeof(cfg)); cx2341x_ctrl_fill(id, &cfg.name, &cfg.type, &min, &max, &step, &def, &cfg.flags); cfg.ops = &cx2341x_ops; cfg.id = id; cfg.min = min; cfg.max = max; cfg.def = def; if (cfg.type == V4L2_CTRL_TYPE_MENU) { cfg.step = 0; cfg.menu_skip_mask = step; cfg.qmenu = cx2341x_get_menu(id); } else { cfg.step = step; cfg.menu_skip_mask = 0; } return v4l2_ctrl_new_custom(hdl, &cfg, NULL); } static struct v4l2_ctrl *cx2341x_ctrl_new_std(struct v4l2_ctrl_handler *hdl, u32 id, s32 min, s32 max, s32 step, s32 def) { return v4l2_ctrl_new_std(hdl, &cx2341x_ops, id, min, max, step, def); } static struct v4l2_ctrl *cx2341x_ctrl_new_menu(struct v4l2_ctrl_handler *hdl, u32 id, s32 max, s32 mask, s32 def) { return v4l2_ctrl_new_std_menu(hdl, &cx2341x_ops, id, max, mask, def); } int cx2341x_handler_init(struct cx2341x_handler *cxhdl, unsigned nr_of_controls_hint) { struct v4l2_ctrl_handler *hdl = &cxhdl->hdl; u32 caps = cxhdl->capabilities; int has_sliced_vbi = caps & CX2341X_CAP_HAS_SLICED_VBI; int has_ac3 = caps & CX2341X_CAP_HAS_AC3; int has_ts = caps & CX2341X_CAP_HAS_TS; cxhdl->width = 720; cxhdl->height = 480; v4l2_ctrl_handler_init(hdl, nr_of_controls_hint); /* Add controls in ascending control ID order for fastest insertion time. */ cxhdl->stream_type = cx2341x_ctrl_new_menu(hdl, V4L2_CID_MPEG_STREAM_TYPE, V4L2_MPEG_STREAM_TYPE_MPEG2_SVCD, has_ts ? 0 : 2, V4L2_MPEG_STREAM_TYPE_MPEG2_PS); cxhdl->stream_vbi_fmt = cx2341x_ctrl_new_menu(hdl, V4L2_CID_MPEG_STREAM_VBI_FMT, V4L2_MPEG_STREAM_VBI_FMT_IVTV, has_sliced_vbi ? 0 : 2, V4L2_MPEG_STREAM_VBI_FMT_NONE); cxhdl->audio_sampling_freq = cx2341x_ctrl_new_menu(hdl, V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ, V4L2_MPEG_AUDIO_SAMPLING_FREQ_32000, 0, V4L2_MPEG_AUDIO_SAMPLING_FREQ_48000); cxhdl->audio_encoding = cx2341x_ctrl_new_menu(hdl, V4L2_CID_MPEG_AUDIO_ENCODING, V4L2_MPEG_AUDIO_ENCODING_AC3, has_ac3 ? ~0x12 : ~0x2, V4L2_MPEG_AUDIO_ENCODING_LAYER_2); cxhdl->audio_l2_bitrate = cx2341x_ctrl_new_menu(hdl, V4L2_CID_MPEG_AUDIO_L2_BITRATE, V4L2_MPEG_AUDIO_L2_BITRATE_384K, 0x1ff, V4L2_MPEG_AUDIO_L2_BITRATE_224K); cxhdl->audio_mode = cx2341x_ctrl_new_menu(hdl, V4L2_CID_MPEG_AUDIO_MODE, V4L2_MPEG_AUDIO_MODE_MONO, 0, V4L2_MPEG_AUDIO_MODE_STEREO); cxhdl->audio_mode_extension = cx2341x_ctrl_new_menu(hdl, V4L2_CID_MPEG_AUDIO_MODE_EXTENSION, V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_16, 0, V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_4); cxhdl->audio_emphasis = cx2341x_ctrl_new_menu(hdl, V4L2_CID_MPEG_AUDIO_EMPHASIS, V4L2_MPEG_AUDIO_EMPHASIS_CCITT_J17, 0, V4L2_MPEG_AUDIO_EMPHASIS_NONE); cxhdl->audio_crc = cx2341x_ctrl_new_menu(hdl, V4L2_CID_MPEG_AUDIO_CRC, V4L2_MPEG_AUDIO_CRC_CRC16, 0, V4L2_MPEG_AUDIO_CRC_NONE); cx2341x_ctrl_new_std(hdl, V4L2_CID_MPEG_AUDIO_MUTE, 0, 1, 1, 0); if (has_ac3) cxhdl->audio_ac3_bitrate = cx2341x_ctrl_new_menu(hdl, V4L2_CID_MPEG_AUDIO_AC3_BITRATE, V4L2_MPEG_AUDIO_AC3_BITRATE_448K, 0x03, V4L2_MPEG_AUDIO_AC3_BITRATE_224K); cxhdl->video_encoding = cx2341x_ctrl_new_menu(hdl, V4L2_CID_MPEG_VIDEO_ENCODING, V4L2_MPEG_VIDEO_ENCODING_MPEG_2, 0, V4L2_MPEG_VIDEO_ENCODING_MPEG_2); cx2341x_ctrl_new_menu(hdl, V4L2_CID_MPEG_VIDEO_ASPECT, V4L2_MPEG_VIDEO_ASPECT_221x100, 0, V4L2_MPEG_VIDEO_ASPECT_4x3); cxhdl->video_b_frames = cx2341x_ctrl_new_std(hdl, V4L2_CID_MPEG_VIDEO_B_FRAMES, 0, 33, 1, 2); cxhdl->video_gop_size = cx2341x_ctrl_new_std(hdl, V4L2_CID_MPEG_VIDEO_GOP_SIZE, 1, 34, 1, cxhdl->is_50hz ? 12 : 15); cx2341x_ctrl_new_std(hdl, V4L2_CID_MPEG_VIDEO_GOP_CLOSURE, 0, 1, 1, 1); cxhdl->video_bitrate_mode = cx2341x_ctrl_new_menu(hdl, V4L2_CID_MPEG_VIDEO_BITRATE_MODE, V4L2_MPEG_VIDEO_BITRATE_MODE_CBR, 0, V4L2_MPEG_VIDEO_BITRATE_MODE_VBR); cxhdl->video_bitrate = cx2341x_ctrl_new_std(hdl, V4L2_CID_MPEG_VIDEO_BITRATE, 0, 27000000, 1, 6000000); cxhdl->video_bitrate_peak = cx2341x_ctrl_new_std(hdl, V4L2_CID_MPEG_VIDEO_BITRATE_PEAK, 0, 27000000, 1, 8000000); cx2341x_ctrl_new_std(hdl, V4L2_CID_MPEG_VIDEO_TEMPORAL_DECIMATION, 0, 255, 1, 0); cxhdl->video_mute = cx2341x_ctrl_new_std(hdl, V4L2_CID_MPEG_VIDEO_MUTE, 0, 1, 1, 0); cxhdl->video_mute_yuv = cx2341x_ctrl_new_std(hdl, V4L2_CID_MPEG_VIDEO_MUTE_YUV, 0, 0xffffff, 1, 0x008080); /* CX23415/6 specific */ cxhdl->video_spatial_filter_mode = cx2341x_ctrl_new_custom(hdl, V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE, V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_MANUAL, V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_AUTO, 0, V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_MANUAL); cxhdl->video_spatial_filter = cx2341x_ctrl_new_custom(hdl, V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER, 0, 15, 1, 0); cxhdl->video_luma_spatial_filter_type = cx2341x_ctrl_new_custom(hdl, V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE, V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_OFF, V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_2D_SYM_NON_SEPARABLE, 0, V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_1D_HOR); cxhdl->video_chroma_spatial_filter_type = cx2341x_ctrl_new_custom(hdl, V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE, V4L2_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE_OFF, V4L2_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE_1D_HOR, 0, V4L2_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE_1D_HOR); cxhdl->video_temporal_filter_mode = cx2341x_ctrl_new_custom(hdl, V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE, V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_MANUAL, V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_AUTO, 0, V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_MANUAL); cxhdl->video_temporal_filter = cx2341x_ctrl_new_custom(hdl, V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER, 0, 31, 1, 8); cxhdl->video_median_filter_type = cx2341x_ctrl_new_custom(hdl, V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE, V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_OFF, V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_DIAG, 0, V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_OFF); cxhdl->video_luma_median_filter_bottom = cx2341x_ctrl_new_custom(hdl, V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_BOTTOM, 0, 255, 1, 0); cxhdl->video_luma_median_filter_top = cx2341x_ctrl_new_custom(hdl, V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_TOP, 0, 255, 1, 255); cxhdl->video_chroma_median_filter_bottom = cx2341x_ctrl_new_custom(hdl, V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_BOTTOM, 0, 255, 1, 0); cxhdl->video_chroma_median_filter_top = cx2341x_ctrl_new_custom(hdl, V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_TOP, 0, 255, 1, 255); cx2341x_ctrl_new_custom(hdl, V4L2_CID_MPEG_CX2341X_STREAM_INSERT_NAV_PACKETS, 0, 1, 1, 0); if (hdl->error) { int err = hdl->error; v4l2_ctrl_handler_free(hdl); return err; } v4l2_ctrl_cluster(8, &cxhdl->audio_sampling_freq); v4l2_ctrl_cluster(2, &cxhdl->video_b_frames); v4l2_ctrl_cluster(5, &cxhdl->stream_type); v4l2_ctrl_cluster(2, &cxhdl->video_mute); v4l2_ctrl_cluster(3, &cxhdl->video_spatial_filter_mode); v4l2_ctrl_cluster(2, &cxhdl->video_luma_spatial_filter_type); v4l2_ctrl_cluster(2, &cxhdl->video_spatial_filter); v4l2_ctrl_cluster(4, &cxhdl->video_luma_median_filter_top); return 0; } EXPORT_SYMBOL(cx2341x_handler_init); void cx2341x_handler_set_50hz(struct cx2341x_handler *cxhdl, int is_50hz) { cxhdl->is_50hz = is_50hz; cxhdl->video_gop_size->default_value = cxhdl->is_50hz ? 12 : 15; } EXPORT_SYMBOL(cx2341x_handler_set_50hz); int cx2341x_handler_setup(struct cx2341x_handler *cxhdl) { int h = cxhdl->height; int w = cxhdl->width; int err; err = cx2341x_hdl_api(cxhdl, CX2341X_ENC_SET_OUTPUT_PORT, 2, cxhdl->port, 0); if (err) return err; err = cx2341x_hdl_api(cxhdl, CX2341X_ENC_SET_FRAME_RATE, 1, cxhdl->is_50hz); if (err) return err; if (v4l2_ctrl_g_ctrl(cxhdl->video_encoding) == V4L2_MPEG_VIDEO_ENCODING_MPEG_1) { w /= 2; h /= 2; } err = cx2341x_hdl_api(cxhdl, CX2341X_ENC_SET_FRAME_SIZE, 2, h, w); if (err) return err; return v4l2_ctrl_handler_setup(&cxhdl->hdl); } EXPORT_SYMBOL(cx2341x_handler_setup); void cx2341x_handler_set_busy(struct cx2341x_handler *cxhdl, int busy) { v4l2_ctrl_grab(cxhdl->audio_sampling_freq, busy); v4l2_ctrl_grab(cxhdl->audio_encoding, busy); v4l2_ctrl_grab(cxhdl->audio_l2_bitrate, busy); v4l2_ctrl_grab(cxhdl->audio_ac3_bitrate, busy); v4l2_ctrl_grab(cxhdl->stream_vbi_fmt, busy); v4l2_ctrl_grab(cxhdl->stream_type, busy); v4l2_ctrl_grab(cxhdl->video_bitrate_mode, busy); v4l2_ctrl_grab(cxhdl->video_bitrate, busy); v4l2_ctrl_grab(cxhdl->video_bitrate_peak, busy); } EXPORT_SYMBOL(cx2341x_handler_set_busy); |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BLOCKGROUP_LOCK_H #define _LINUX_BLOCKGROUP_LOCK_H /* * Per-blockgroup locking for ext2 and ext3. * * Simple hashed spinlocking. */ #include <linux/spinlock.h> #include <linux/cache.h> #ifdef CONFIG_SMP #define NR_BG_LOCKS (4 << ilog2(NR_CPUS < 32 ? NR_CPUS : 32)) #else #define NR_BG_LOCKS 1 #endif struct bgl_lock { spinlock_t lock; } ____cacheline_aligned_in_smp; struct blockgroup_lock { struct bgl_lock locks[NR_BG_LOCKS]; }; static inline void bgl_lock_init(struct blockgroup_lock *bgl) { int i; for (i = 0; i < NR_BG_LOCKS; i++) spin_lock_init(&bgl->locks[i].lock); } static inline spinlock_t * bgl_lock_ptr(struct blockgroup_lock *bgl, unsigned int block_group) { return &bgl->locks[block_group & (NR_BG_LOCKS-1)].lock; } #endif |
25 1 8 9 9 9 1 10 7 6 2 4 5 1 1 4 3 2 1 1 1 2 1 2 2 2 2 2 2 1 1 5 5 5 2 4 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/dir.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Handling of directories */ #include <linux/errno.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/random.h> #include <linux/nls.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" #include "xattr.h" static inline void hfsplus_instantiate(struct dentry *dentry, struct inode *inode, u32 cnid) { dentry->d_fsdata = (void *)(unsigned long)cnid; d_instantiate(dentry, inode); } /* Find the entry inside dir named dentry->d_name */ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; struct hfs_find_data fd; struct super_block *sb; hfsplus_cat_entry entry; int err; u32 cnid, linkid = 0; u16 type; sb = dir->i_sb; dentry->d_fsdata = NULL; err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return ERR_PTR(err); err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); if (unlikely(err < 0)) goto fail; again: err = hfs_brec_read(&fd, &entry, sizeof(entry)); if (err) { if (err == -ENOENT) { hfs_find_exit(&fd); /* No such entry */ inode = NULL; goto out; } goto fail; } type = be16_to_cpu(entry.type); if (type == HFSPLUS_FOLDER) { if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { err = -EIO; goto fail; } cnid = be32_to_cpu(entry.folder.id); dentry->d_fsdata = (void *)(unsigned long)cnid; } else if (type == HFSPLUS_FILE) { if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { err = -EIO; goto fail; } cnid = be32_to_cpu(entry.file.id); if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) && entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) && HFSPLUS_SB(sb)->hidden_dir && (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)-> create_date || entry.file.create_date == HFSPLUS_I(d_inode(sb->s_root))-> create_date)) { struct qstr str; char name[32]; if (dentry->d_fsdata) { /* * We found a link pointing to another link, * so ignore it and treat it as regular file. */ cnid = (unsigned long)dentry->d_fsdata; linkid = 0; } else { dentry->d_fsdata = (void *)(unsigned long)cnid; linkid = be32_to_cpu(entry.file.permissions.dev); str.len = sprintf(name, "iNode%d", linkid); str.name = name; err = hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb)->hidden_dir->i_ino, &str); if (unlikely(err < 0)) goto fail; goto again; } } else if (!dentry->d_fsdata) dentry->d_fsdata = (void *)(unsigned long)cnid; } else { pr_err("invalid catalog entry type in lookup\n"); err = -EIO; goto fail; } hfs_find_exit(&fd); inode = hfsplus_iget(dir->i_sb, cnid); if (IS_ERR(inode)) return ERR_CAST(inode); if (S_ISREG(inode->i_mode)) HFSPLUS_I(inode)->linkid = linkid; out: return d_splice_alias(inode, dentry); fail: hfs_find_exit(&fd); return ERR_PTR(err); } static int hfsplus_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; int len, err; char *strbuf; hfsplus_cat_entry entry; struct hfs_find_data fd; struct hfsplus_readdir_data *rd; u16 type; if (file->f_pos >= inode->i_size) return 0; err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return err; strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN + 1, GFP_KERNEL); if (!strbuf) { err = -ENOMEM; goto out; } hfsplus_cat_build_key_with_cnid(sb, fd.search_key, inode->i_ino); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; if (ctx->pos == 0) { /* This is completely artificial... */ if (!dir_emit_dot(file, ctx)) goto out; ctx->pos = 1; } if (ctx->pos == 1) { if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { err = -EIO; goto out; } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) { pr_err("bad catalog folder thread\n"); err = -EIO; goto out; } if (fd.entrylength < HFSPLUS_MIN_THREAD_SZ) { pr_err("truncated catalog thread\n"); err = -EIO; goto out; } if (!dir_emit(ctx, "..", 2, be32_to_cpu(entry.thread.parentID), DT_DIR)) goto out; ctx->pos = 2; } if (ctx->pos >= inode->i_size) goto out; err = hfs_brec_goto(&fd, ctx->pos - 1); if (err) goto out; for (;;) { if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) { pr_err("walked past end of dir\n"); err = -EIO; goto out; } if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { err = -EIO; goto out; } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); type = be16_to_cpu(entry.type); len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN; err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len); if (err) goto out; if (type == HFSPLUS_FOLDER) { if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { pr_err("small dir entry\n"); err = -EIO; goto out; } if (HFSPLUS_SB(sb)->hidden_dir && HFSPLUS_SB(sb)->hidden_dir->i_ino == be32_to_cpu(entry.folder.id)) goto next; if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.folder.id), DT_DIR)) break; } else if (type == HFSPLUS_FILE) { u16 mode; unsigned type = DT_UNKNOWN; if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { pr_err("small file entry\n"); err = -EIO; goto out; } mode = be16_to_cpu(entry.file.permissions.mode); if (S_ISREG(mode)) type = DT_REG; else if (S_ISLNK(mode)) type = DT_LNK; else if (S_ISFIFO(mode)) type = DT_FIFO; else if (S_ISCHR(mode)) type = DT_CHR; else if (S_ISBLK(mode)) type = DT_BLK; else if (S_ISSOCK(mode)) type = DT_SOCK; if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.file.id), type)) break; } else { pr_err("bad catalog entry type\n"); err = -EIO; goto out; } next: ctx->pos++; if (ctx->pos >= inode->i_size) goto out; err = hfs_brec_goto(&fd, 1); if (err) goto out; } rd = file->private_data; if (!rd) { rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL); if (!rd) { err = -ENOMEM; goto out; } file->private_data = rd; rd->file = file; spin_lock(&HFSPLUS_I(inode)->open_dir_lock); list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list); spin_unlock(&HFSPLUS_I(inode)->open_dir_lock); } /* * Can be done after the list insertion; exclusion with * hfsplus_delete_cat() is provided by directory lock. */ memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); out: kfree(strbuf); hfs_find_exit(&fd); return err; } static int hfsplus_dir_release(struct inode *inode, struct file *file) { struct hfsplus_readdir_data *rd = file->private_data; if (rd) { spin_lock(&HFSPLUS_I(inode)->open_dir_lock); list_del(&rd->list); spin_unlock(&HFSPLUS_I(inode)->open_dir_lock); kfree(rd); } return 0; } static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, struct dentry *dst_dentry) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb); struct inode *inode = d_inode(src_dentry); struct inode *src_dir = d_inode(src_dentry->d_parent); struct qstr str; char name[32]; u32 cnid, id; int res; if (HFSPLUS_IS_RSRC(inode)) return -EPERM; if (!S_ISREG(inode->i_mode)) return -EPERM; mutex_lock(&sbi->vh_mutex); if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) { for (;;) { get_random_bytes(&id, sizeof(cnid)); id &= 0x3fffffff; str.name = name; str.len = sprintf(name, "iNode%d", id); res = hfsplus_rename_cat(inode->i_ino, src_dir, &src_dentry->d_name, sbi->hidden_dir, &str); if (!res) break; if (res != -EEXIST) goto out; } HFSPLUS_I(inode)->linkid = id; cnid = sbi->next_cnid++; src_dentry->d_fsdata = (void *)(unsigned long)cnid; res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode); if (res) /* panic? */ goto out; sbi->file_count++; } cnid = sbi->next_cnid++; res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode); if (res) goto out; inc_nlink(inode); hfsplus_instantiate(dst_dentry, inode, cnid); ihold(inode); inode_set_ctime_current(inode); mark_inode_dirty(inode); sbi->file_count++; hfsplus_mark_mdb_dirty(dst_dir->i_sb); out: mutex_unlock(&sbi->vh_mutex); return res; } static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode = d_inode(dentry); struct qstr str; char name[32]; u32 cnid; int res; if (HFSPLUS_IS_RSRC(inode)) return -EPERM; mutex_lock(&sbi->vh_mutex); cnid = (u32)(unsigned long)dentry->d_fsdata; if (inode->i_ino == cnid && atomic_read(&HFSPLUS_I(inode)->opencnt)) { str.name = name; str.len = sprintf(name, "temp%lu", inode->i_ino); res = hfsplus_rename_cat(inode->i_ino, dir, &dentry->d_name, sbi->hidden_dir, &str); if (!res) { inode->i_flags |= S_DEAD; drop_nlink(inode); } goto out; } res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); if (res) goto out; if (inode->i_nlink > 0) drop_nlink(inode); if (inode->i_ino == cnid) clear_nlink(inode); if (!inode->i_nlink) { if (inode->i_ino != cnid) { sbi->file_count--; if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) { res = hfsplus_delete_cat(inode->i_ino, sbi->hidden_dir, NULL); if (!res) hfsplus_delete_inode(inode); } else inode->i_flags |= S_DEAD; } else hfsplus_delete_inode(inode); } else sbi->file_count--; inode_set_ctime_current(inode); mark_inode_dirty(inode); out: mutex_unlock(&sbi->vh_mutex); return res; } static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode = d_inode(dentry); int res; if (inode->i_size != 2) return -ENOTEMPTY; mutex_lock(&sbi->vh_mutex); res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); if (res) goto out; clear_nlink(inode); inode_set_ctime_current(inode); hfsplus_delete_inode(inode); mark_inode_dirty(inode); out: mutex_unlock(&sbi->vh_mutex); return res; } static int hfsplus_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode; int res = -ENOMEM; mutex_lock(&sbi->vh_mutex); inode = hfsplus_new_inode(dir->i_sb, dir, S_IFLNK | S_IRWXUGO); if (!inode) goto out; res = page_symlink(inode, symname, strlen(symname) + 1); if (res) goto out_err; res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); if (res) goto out_err; res = hfsplus_init_security(inode, dir, &dentry->d_name); if (res == -EOPNOTSUPP) res = 0; /* Operation is not supported. */ else if (res) { /* Try to delete anyway without error analysis. */ hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); goto out_err; } hfsplus_instantiate(dentry, inode, inode->i_ino); mark_inode_dirty(inode); goto out; out_err: clear_nlink(inode); hfsplus_delete_inode(inode); iput(inode); out: mutex_unlock(&sbi->vh_mutex); return res; } static int hfsplus_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode; int res = -ENOMEM; mutex_lock(&sbi->vh_mutex); inode = hfsplus_new_inode(dir->i_sb, dir, mode); if (!inode) goto out; if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) init_special_inode(inode, mode, rdev); res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); if (res) goto failed_mknod; res = hfsplus_init_security(inode, dir, &dentry->d_name); if (res == -EOPNOTSUPP) res = 0; /* Operation is not supported. */ else if (res) { /* Try to delete anyway without error analysis. */ hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); goto failed_mknod; } hfsplus_instantiate(dentry, inode, inode->i_ino); mark_inode_dirty(inode); goto out; failed_mknod: clear_nlink(inode); hfsplus_delete_inode(inode); iput(inode); out: mutex_unlock(&sbi->vh_mutex); return res; } static int hfsplus_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } static int hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); } static int hfsplus_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int res; if (flags & ~RENAME_NOREPLACE) return -EINVAL; /* Unlink destination if it already exists */ if (d_really_is_positive(new_dentry)) { if (d_is_dir(new_dentry)) res = hfsplus_rmdir(new_dir, new_dentry); else res = hfsplus_unlink(new_dir, new_dentry); if (res) return res; } res = hfsplus_rename_cat((u32)(unsigned long)old_dentry->d_fsdata, old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); if (!res) new_dentry->d_fsdata = old_dentry->d_fsdata; return res; } const struct inode_operations hfsplus_dir_inode_operations = { .lookup = hfsplus_lookup, .create = hfsplus_create, .link = hfsplus_link, .unlink = hfsplus_unlink, .mkdir = hfsplus_mkdir, .rmdir = hfsplus_rmdir, .symlink = hfsplus_symlink, .mknod = hfsplus_mknod, .rename = hfsplus_rename, .getattr = hfsplus_getattr, .listxattr = hfsplus_listxattr, .fileattr_get = hfsplus_fileattr_get, .fileattr_set = hfsplus_fileattr_set, }; const struct file_operations hfsplus_dir_operations = { .fsync = hfsplus_file_fsync, .read = generic_read_dir, .iterate_shared = hfsplus_readdir, .unlocked_ioctl = hfsplus_ioctl, .llseek = generic_file_llseek, .release = hfsplus_dir_release, }; |
2 1 1 6 6 6 6 6 6 6 6 6 6 6 4 9 7 9 16 16 9 7 16 19 1 1 1 16 16 7 9 7 23 16 4 1 6 2 3 1 4 20 20 4 16 6 6 6 20 20 6 26 27 32 29 4 6 2 1 1 3 16 2 1 1 1 2 27 27 27 27 6 27 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2020, Microsoft Corporation. * * Author(s): Steve French <stfrench@microsoft.com> * David Howells <dhowells@redhat.com> */ /* #include <linux/module.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/magic.h> #include <linux/security.h> #include <net/net_namespace.h> #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" #endif */ #include <linux/ctype.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/parser.h> #include <linux/utsname.h> #include "cifsfs.h" #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" #include "cifs_unicode.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" #include "ntlmssp.h" #include "nterr.h" #include "rfc1002pdu.h" #include "fs_context.h" static DEFINE_MUTEX(cifs_mount_mutex); static const match_table_t cifs_smb_version_tokens = { { Smb_1, SMB1_VERSION_STRING }, { Smb_20, SMB20_VERSION_STRING}, { Smb_21, SMB21_VERSION_STRING }, { Smb_30, SMB30_VERSION_STRING }, { Smb_302, SMB302_VERSION_STRING }, { Smb_302, ALT_SMB302_VERSION_STRING }, { Smb_311, SMB311_VERSION_STRING }, { Smb_311, ALT_SMB311_VERSION_STRING }, { Smb_3any, SMB3ANY_VERSION_STRING }, { Smb_default, SMBDEFAULT_VERSION_STRING }, { Smb_version_err, NULL } }; static const match_table_t cifs_secflavor_tokens = { { Opt_sec_krb5, "krb5" }, { Opt_sec_krb5i, "krb5i" }, { Opt_sec_krb5p, "krb5p" }, { Opt_sec_ntlmsspi, "ntlmsspi" }, { Opt_sec_ntlmssp, "ntlmssp" }, { Opt_sec_ntlmv2, "nontlm" }, { Opt_sec_ntlmv2, "ntlmv2" }, { Opt_sec_ntlmv2i, "ntlmv2i" }, { Opt_sec_none, "none" }, { Opt_sec_err, NULL } }; const struct fs_parameter_spec smb3_fs_parameters[] = { /* Mount options that take no arguments */ fsparam_flag_no("user_xattr", Opt_user_xattr), fsparam_flag_no("forceuid", Opt_forceuid), fsparam_flag_no("multichannel", Opt_multichannel), fsparam_flag_no("forcegid", Opt_forcegid), fsparam_flag("noblocksend", Opt_noblocksend), fsparam_flag("noautotune", Opt_noautotune), fsparam_flag("nolease", Opt_nolease), fsparam_flag_no("hard", Opt_hard), fsparam_flag_no("soft", Opt_soft), fsparam_flag_no("perm", Opt_perm), fsparam_flag("nodelete", Opt_nodelete), fsparam_flag_no("mapposix", Opt_mapposix), fsparam_flag("mapchars", Opt_mapchars), fsparam_flag("nomapchars", Opt_nomapchars), fsparam_flag_no("sfu", Opt_sfu), fsparam_flag("nodfs", Opt_nodfs), fsparam_flag_no("posixpaths", Opt_posixpaths), fsparam_flag_no("unix", Opt_unix), fsparam_flag_no("linux", Opt_unix), fsparam_flag_no("posix", Opt_unix), fsparam_flag("nocase", Opt_nocase), fsparam_flag("ignorecase", Opt_nocase), fsparam_flag_no("brl", Opt_brl), fsparam_flag_no("handlecache", Opt_handlecache), fsparam_flag("forcemandatorylock", Opt_forcemandatorylock), fsparam_flag("forcemand", Opt_forcemandatorylock), fsparam_flag("setuidfromacl", Opt_setuidfromacl), fsparam_flag("idsfromsid", Opt_setuidfromacl), fsparam_flag_no("setuids", Opt_setuids), fsparam_flag_no("dynperm", Opt_dynperm), fsparam_flag_no("intr", Opt_intr), fsparam_flag_no("strictsync", Opt_strictsync), fsparam_flag_no("serverino", Opt_serverino), fsparam_flag("rwpidforward", Opt_rwpidforward), fsparam_flag("cifsacl", Opt_cifsacl), fsparam_flag_no("acl", Opt_acl), fsparam_flag("locallease", Opt_locallease), fsparam_flag("sign", Opt_sign), fsparam_flag("ignore_signature", Opt_ignore_signature), fsparam_flag("signloosely", Opt_ignore_signature), fsparam_flag("seal", Opt_seal), fsparam_flag("noac", Opt_noac), fsparam_flag("fsc", Opt_fsc), fsparam_flag("mfsymlinks", Opt_mfsymlinks), fsparam_flag("multiuser", Opt_multiuser), fsparam_flag("sloppy", Opt_sloppy), fsparam_flag("nosharesock", Opt_nosharesock), fsparam_flag_no("persistenthandles", Opt_persistent), fsparam_flag_no("resilienthandles", Opt_resilient), fsparam_flag_no("tcpnodelay", Opt_tcp_nodelay), fsparam_flag("nosparse", Opt_nosparse), fsparam_flag("domainauto", Opt_domainauto), fsparam_flag("rdma", Opt_rdma), fsparam_flag("modesid", Opt_modesid), fsparam_flag("modefromsid", Opt_modesid), fsparam_flag("rootfs", Opt_rootfs), fsparam_flag("compress", Opt_compress), fsparam_flag("witness", Opt_witness), /* Mount options which take numeric value */ fsparam_u32("backupuid", Opt_backupuid), fsparam_u32("backupgid", Opt_backupgid), fsparam_u32("uid", Opt_uid), fsparam_u32("cruid", Opt_cruid), fsparam_u32("gid", Opt_gid), fsparam_u32("file_mode", Opt_file_mode), fsparam_u32("dirmode", Opt_dirmode), fsparam_u32("dir_mode", Opt_dirmode), fsparam_u32("port", Opt_port), fsparam_u32("min_enc_offload", Opt_min_enc_offload), fsparam_u32("retrans", Opt_retrans), fsparam_u32("esize", Opt_min_enc_offload), fsparam_u32("bsize", Opt_blocksize), fsparam_u32("rasize", Opt_rasize), fsparam_u32("rsize", Opt_rsize), fsparam_u32("wsize", Opt_wsize), fsparam_u32("actimeo", Opt_actimeo), fsparam_u32("acdirmax", Opt_acdirmax), fsparam_u32("acregmax", Opt_acregmax), fsparam_u32("closetimeo", Opt_closetimeo), fsparam_u32("echo_interval", Opt_echo_interval), fsparam_u32("max_credits", Opt_max_credits), fsparam_u32("max_cached_dirs", Opt_max_cached_dirs), fsparam_u32("handletimeout", Opt_handletimeout), fsparam_u64("snapshot", Opt_snapshot), fsparam_u32("max_channels", Opt_max_channels), /* Mount options which take string value */ fsparam_string("source", Opt_source), fsparam_string("user", Opt_user), fsparam_string("username", Opt_user), fsparam_string("pass", Opt_pass), fsparam_string("password", Opt_pass), fsparam_string("ip", Opt_ip), fsparam_string("addr", Opt_ip), fsparam_string("domain", Opt_domain), fsparam_string("dom", Opt_domain), fsparam_string("srcaddr", Opt_srcaddr), fsparam_string("iocharset", Opt_iocharset), fsparam_string("netbiosname", Opt_netbiosname), fsparam_string("servern", Opt_servern), fsparam_string("ver", Opt_ver), fsparam_string("vers", Opt_vers), fsparam_string("sec", Opt_sec), fsparam_string("cache", Opt_cache), fsparam_string("reparse", Opt_reparse), /* Arguments that should be ignored */ fsparam_flag("guest", Opt_ignore), fsparam_flag("noatime", Opt_ignore), fsparam_flag("relatime", Opt_ignore), fsparam_flag("_netdev", Opt_ignore), fsparam_flag_no("suid", Opt_ignore), fsparam_flag_no("exec", Opt_ignore), fsparam_flag_no("dev", Opt_ignore), fsparam_flag_no("mand", Opt_ignore), fsparam_flag_no("auto", Opt_ignore), fsparam_string("cred", Opt_ignore), fsparam_string("credentials", Opt_ignore), /* * UNC and prefixpath is now extracted from Opt_source * in the new mount API so we can just ignore them going forward. */ fsparam_string("unc", Opt_ignore), fsparam_string("prefixpath", Opt_ignore), {} }; static int cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) { substring_t args[MAX_OPT_ARGS]; /* * With mount options, the last one should win. Reset any existing * settings back to default. */ ctx->sectype = Unspecified; ctx->sign = false; switch (match_token(value, cifs_secflavor_tokens, args)) { case Opt_sec_krb5p: cifs_errorf(fc, "sec=krb5p is not supported. Use sec=krb5,seal instead\n"); return 1; case Opt_sec_krb5i: ctx->sign = true; fallthrough; case Opt_sec_krb5: ctx->sectype = Kerberos; break; case Opt_sec_ntlmsspi: ctx->sign = true; fallthrough; case Opt_sec_ntlmssp: ctx->sectype = RawNTLMSSP; break; case Opt_sec_ntlmv2i: ctx->sign = true; fallthrough; case Opt_sec_ntlmv2: ctx->sectype = NTLMv2; break; case Opt_sec_none: ctx->nullauth = 1; kfree(ctx->username); ctx->username = NULL; break; default: cifs_errorf(fc, "bad security option: %s\n", value); return 1; } return 0; } static const match_table_t cifs_cacheflavor_tokens = { { Opt_cache_loose, "loose" }, { Opt_cache_strict, "strict" }, { Opt_cache_none, "none" }, { Opt_cache_ro, "ro" }, { Opt_cache_rw, "singleclient" }, { Opt_cache_err, NULL } }; static int cifs_parse_cache_flavor(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) { substring_t args[MAX_OPT_ARGS]; switch (match_token(value, cifs_cacheflavor_tokens, args)) { case Opt_cache_loose: ctx->direct_io = false; ctx->strict_io = false; ctx->cache_ro = false; ctx->cache_rw = false; break; case Opt_cache_strict: ctx->direct_io = false; ctx->strict_io = true; ctx->cache_ro = false; ctx->cache_rw = false; break; case Opt_cache_none: ctx->direct_io = true; ctx->strict_io = false; ctx->cache_ro = false; ctx->cache_rw = false; break; case Opt_cache_ro: ctx->direct_io = false; ctx->strict_io = false; ctx->cache_ro = true; ctx->cache_rw = false; break; case Opt_cache_rw: ctx->direct_io = false; ctx->strict_io = false; ctx->cache_ro = false; ctx->cache_rw = true; break; default: cifs_errorf(fc, "bad cache= option: %s\n", value); return 1; } return 0; } static const match_table_t reparse_flavor_tokens = { { Opt_reparse_default, "default" }, { Opt_reparse_nfs, "nfs" }, { Opt_reparse_wsl, "wsl" }, { Opt_reparse_err, NULL }, }; static int parse_reparse_flavor(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) { substring_t args[MAX_OPT_ARGS]; switch (match_token(value, reparse_flavor_tokens, args)) { case Opt_reparse_default: ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT; break; case Opt_reparse_nfs: ctx->reparse_type = CIFS_REPARSE_TYPE_NFS; break; case Opt_reparse_wsl: ctx->reparse_type = CIFS_REPARSE_TYPE_WSL; break; default: cifs_errorf(fc, "bad reparse= option: %s\n", value); return 1; } return 0; } #define DUP_CTX_STR(field) \ do { \ if (ctx->field) { \ new_ctx->field = kstrdup(ctx->field, GFP_ATOMIC); \ if (new_ctx->field == NULL) { \ smb3_cleanup_fs_context_contents(new_ctx); \ return -ENOMEM; \ } \ } \ } while (0) int smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx) { memcpy(new_ctx, ctx, sizeof(*ctx)); new_ctx->prepath = NULL; new_ctx->nodename = NULL; new_ctx->username = NULL; new_ctx->password = NULL; new_ctx->server_hostname = NULL; new_ctx->domainname = NULL; new_ctx->UNC = NULL; new_ctx->source = NULL; new_ctx->iocharset = NULL; new_ctx->leaf_fullpath = NULL; /* * Make sure to stay in sync with smb3_cleanup_fs_context_contents() */ DUP_CTX_STR(prepath); DUP_CTX_STR(username); DUP_CTX_STR(password); DUP_CTX_STR(server_hostname); DUP_CTX_STR(UNC); DUP_CTX_STR(source); DUP_CTX_STR(domainname); DUP_CTX_STR(nodename); DUP_CTX_STR(iocharset); DUP_CTX_STR(leaf_fullpath); return 0; } static int cifs_parse_smb_version(struct fs_context *fc, char *value, struct smb3_fs_context *ctx, bool is_smb3) { substring_t args[MAX_OPT_ARGS]; switch (match_token(value, cifs_smb_version_tokens, args)) { #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY case Smb_1: if (disable_legacy_dialects) { cifs_errorf(fc, "mount with legacy dialect disabled\n"); return 1; } if (is_smb3) { cifs_errorf(fc, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); return 1; } cifs_errorf(fc, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); ctx->ops = &smb1_operations; ctx->vals = &smb1_values; break; case Smb_20: if (disable_legacy_dialects) { cifs_errorf(fc, "mount with legacy dialect disabled\n"); return 1; } if (is_smb3) { cifs_errorf(fc, "vers=2.0 not permitted when mounting with smb3\n"); return 1; } ctx->ops = &smb20_operations; ctx->vals = &smb20_values; break; #else case Smb_1: cifs_errorf(fc, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n"); return 1; case Smb_20: cifs_errorf(fc, "vers=2.0 mount not permitted when legacy dialects disabled\n"); return 1; #endif /* CIFS_ALLOW_INSECURE_LEGACY */ case Smb_21: ctx->ops = &smb21_operations; ctx->vals = &smb21_values; break; case Smb_30: ctx->ops = &smb30_operations; ctx->vals = &smb30_values; break; case Smb_302: ctx->ops = &smb30_operations; /* currently identical with 3.0 */ ctx->vals = &smb302_values; break; case Smb_311: ctx->ops = &smb311_operations; ctx->vals = &smb311_values; break; case Smb_3any: ctx->ops = &smb30_operations; /* currently identical with 3.0 */ ctx->vals = &smb3any_values; break; case Smb_default: ctx->ops = &smb30_operations; ctx->vals = &smbdefault_values; break; default: cifs_errorf(fc, "Unknown vers= option specified: %s\n", value); return 1; } return 0; } int smb3_parse_opt(const char *options, const char *key, char **val) { int rc = -ENOENT; char *opts, *orig, *p; orig = opts = kstrdup(options, GFP_KERNEL); if (!opts) return -ENOMEM; while ((p = strsep(&opts, ","))) { char *nval; if (!*p) continue; if (strncasecmp(p, key, strlen(key))) continue; nval = strchr(p, '='); if (nval) { if (nval == p) continue; *nval++ = 0; *val = kstrdup(nval, GFP_KERNEL); rc = !*val ? -ENOMEM : 0; goto out; } } out: kfree(orig); return rc; } /* * Remove duplicate path delimiters. Windows is supposed to do that * but there are some bugs that prevent rename from working if there are * multiple delimiters. * * Return a sanitized duplicate of @path or NULL for empty prefix paths. * Otherwise, return ERR_PTR. * * @gfp indicates the GFP_* flags for kstrdup. * The caller is responsible for freeing the original. */ #define IS_DELIM(c) ((c) == '/' || (c) == '\\') char *cifs_sanitize_prepath(char *prepath, gfp_t gfp) { char *cursor1 = prepath, *cursor2 = prepath; char *s; /* skip all prepended delimiters */ while (IS_DELIM(*cursor1)) cursor1++; /* copy the first letter */ *cursor2 = *cursor1; /* copy the remainder... */ while (*(cursor1++)) { /* ... skipping all duplicated delimiters */ if (IS_DELIM(*cursor1) && IS_DELIM(*cursor2)) continue; *(++cursor2) = *cursor1; } /* if the last character is a delimiter, skip it */ if (IS_DELIM(*(cursor2 - 1))) cursor2--; *cursor2 = '\0'; if (!*prepath) return NULL; s = kstrdup(prepath, gfp); if (!s) return ERR_PTR(-ENOMEM); return s; } /* * Return full path based on the values of @ctx->{UNC,prepath}. * * It is assumed that both values were already parsed by smb3_parse_devname(). */ char *smb3_fs_context_fullpath(const struct smb3_fs_context *ctx, char dirsep) { size_t ulen, plen; char *s; ulen = strlen(ctx->UNC); plen = ctx->prepath ? strlen(ctx->prepath) + 1 : 0; s = kmalloc(ulen + plen + 1, GFP_KERNEL); if (!s) return ERR_PTR(-ENOMEM); memcpy(s, ctx->UNC, ulen); if (plen) { s[ulen] = dirsep; memcpy(s + ulen + 1, ctx->prepath, plen); } s[ulen + plen] = '\0'; convert_delimiter(s, dirsep); return s; } /* * Parse a devname into substrings and populate the ctx->UNC and ctx->prepath * fields with the result. Returns 0 on success and an error otherwise * (e.g. ENOMEM or EINVAL) */ int smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) { char *pos; const char *delims = "/\\"; size_t len; int rc; if (unlikely(!devname || !*devname)) { cifs_dbg(VFS, "Device name not specified\n"); return -EINVAL; } /* make sure we have a valid UNC double delimiter prefix */ len = strspn(devname, delims); if (len != 2) return -EINVAL; /* find delimiter between host and sharename */ pos = strpbrk(devname + 2, delims); if (!pos) return -EINVAL; /* record the server hostname */ kfree(ctx->server_hostname); ctx->server_hostname = kstrndup(devname + 2, pos - devname - 2, GFP_KERNEL); if (!ctx->server_hostname) return -ENOMEM; /* skip past delimiter */ ++pos; /* now go until next delimiter or end of string */ len = strcspn(pos, delims); if (!len) return -EINVAL; /* move "pos" up to delimiter or NULL */ pos += len; kfree(ctx->UNC); ctx->UNC = kstrndup(devname, pos - devname, GFP_KERNEL); if (!ctx->UNC) return -ENOMEM; convert_delimiter(ctx->UNC, '\\'); /* skip any delimiter */ if (*pos == '/' || *pos == '\\') pos++; kfree(ctx->prepath); ctx->prepath = NULL; /* If pos is NULL then no prepath */ if (!*pos) return 0; ctx->prepath = cifs_sanitize_prepath(pos, GFP_KERNEL); if (IS_ERR(ctx->prepath)) { rc = PTR_ERR(ctx->prepath); ctx->prepath = NULL; return rc; } return 0; } static void smb3_fs_context_free(struct fs_context *fc); static int smb3_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param); static int smb3_fs_context_parse_monolithic(struct fs_context *fc, void *data); static int smb3_get_tree(struct fs_context *fc); static int smb3_reconfigure(struct fs_context *fc); static const struct fs_context_operations smb3_fs_context_ops = { .free = smb3_fs_context_free, .parse_param = smb3_fs_context_parse_param, .parse_monolithic = smb3_fs_context_parse_monolithic, .get_tree = smb3_get_tree, .reconfigure = smb3_reconfigure, }; /* * Parse a monolithic block of data from sys_mount(). * smb3_fs_context_parse_monolithic - Parse key[=val][,key[=val]]* mount data * @ctx: The superblock configuration to fill in. * @data: The data to parse * * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be * called from the ->monolithic_mount_data() fs_context operation. * * Returns 0 on success or the error returned by the ->parse_option() fs_context * operation on failure. */ static int smb3_fs_context_parse_monolithic(struct fs_context *fc, void *data) { char *options = data, *key; int ret = 0; if (!options) return 0; ret = security_sb_eat_lsm_opts(options, &fc->security); if (ret) return ret; /* BB Need to add support for sep= here TBD */ while ((key = strsep(&options, ",")) != NULL) { size_t len; char *value; if (*key == 0) break; /* Check if following character is the deliminator If yes, * we have encountered a double deliminator reset the NULL * character to the deliminator */ while (options && options[0] == ',') { len = strlen(key); strcpy(key + len, options); options = strchr(options, ','); if (options) *options++ = 0; } len = 0; value = strchr(key, '='); if (value) { if (value == key) continue; *value++ = 0; len = strlen(value); } ret = vfs_parse_fs_string(fc, key, value, len); if (ret < 0) break; } return ret; } /* * Validate the preparsed information in the config. */ static int smb3_fs_context_validate(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); if (ctx->rdma && ctx->vals->protocol_id < SMB30_PROT_ID) { cifs_errorf(fc, "SMB Direct requires Version >=3.0\n"); return -EOPNOTSUPP; } #ifndef CONFIG_KEYS /* Muliuser mounts require CONFIG_KEYS support */ if (ctx->multiuser) { cifs_errorf(fc, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n"); return -1; } #endif if (ctx->got_version == false) pr_warn_once("No dialect specified on mount. Default has changed to a more secure dialect, SMB2.1 or later (e.g. SMB3.1.1), from CIFS (SMB1). To use the less secure SMB1 dialect to access old servers which do not support SMB3.1.1 (or even SMB3 or SMB2.1) specify vers=1.0 on mount.\n"); if (!ctx->UNC) { cifs_errorf(fc, "CIFS mount error: No usable UNC path provided in device string!\n"); return -1; } /* make sure UNC has a share name */ if (strlen(ctx->UNC) < 3 || !strchr(ctx->UNC + 3, '\\')) { cifs_errorf(fc, "Malformed UNC. Unable to find share name.\n"); return -ENOENT; } if (!ctx->got_ip) { int len; const char *slash; /* No ip= option specified? Try to get it from UNC */ /* Use the address part of the UNC. */ slash = strchr(&ctx->UNC[2], '\\'); len = slash - &ctx->UNC[2]; if (!cifs_convert_address((struct sockaddr *)&ctx->dstaddr, &ctx->UNC[2], len)) { pr_err("Unable to determine destination address\n"); return -EHOSTUNREACH; } } /* set the port that we got earlier */ cifs_set_port((struct sockaddr *)&ctx->dstaddr, ctx->port); if (ctx->override_uid && !ctx->uid_specified) { ctx->override_uid = 0; pr_notice("ignoring forceuid mount option specified with no uid= option\n"); } if (ctx->override_gid && !ctx->gid_specified) { ctx->override_gid = 0; pr_notice("ignoring forcegid mount option specified with no gid= option\n"); } return 0; } static int smb3_get_tree_common(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); struct dentry *root; int rc = 0; root = cifs_smb3_do_mount(fc->fs_type, 0, ctx); if (IS_ERR(root)) return PTR_ERR(root); fc->root = root; return rc; } /* * Create an SMB3 superblock from the parameters passed. */ static int smb3_get_tree(struct fs_context *fc) { int err = smb3_fs_context_validate(fc); int ret; if (err) return err; mutex_lock(&cifs_mount_mutex); ret = smb3_get_tree_common(fc); mutex_unlock(&cifs_mount_mutex); return ret; } static void smb3_fs_context_free(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); smb3_cleanup_fs_context(ctx); } /* * Compare the old and new proposed context during reconfigure * and check if the changes are compatible. */ static int smb3_verify_reconfigure_ctx(struct fs_context *fc, struct smb3_fs_context *new_ctx, struct smb3_fs_context *old_ctx, bool need_recon) { if (new_ctx->posix_paths != old_ctx->posix_paths) { cifs_errorf(fc, "can not change posixpaths during remount\n"); return -EINVAL; } if (new_ctx->sectype != old_ctx->sectype) { cifs_errorf(fc, "can not change sec during remount\n"); return -EINVAL; } if (new_ctx->multiuser != old_ctx->multiuser) { cifs_errorf(fc, "can not change multiuser during remount\n"); return -EINVAL; } if (new_ctx->UNC && (!old_ctx->UNC || strcmp(new_ctx->UNC, old_ctx->UNC))) { cifs_errorf(fc, "can not change UNC during remount\n"); return -EINVAL; } if (new_ctx->username && (!old_ctx->username || strcmp(new_ctx->username, old_ctx->username))) { cifs_errorf(fc, "can not change username during remount\n"); return -EINVAL; } if (new_ctx->password && (!old_ctx->password || strcmp(new_ctx->password, old_ctx->password))) { if (need_recon == false) { cifs_errorf(fc, "can not change password of active session during remount\n"); return -EINVAL; } else if (old_ctx->sectype == Kerberos) { cifs_errorf(fc, "can not change password for Kerberos via remount\n"); return -EINVAL; } } if (new_ctx->domainname && (!old_ctx->domainname || strcmp(new_ctx->domainname, old_ctx->domainname))) { cifs_errorf(fc, "can not change domainname during remount\n"); return -EINVAL; } if (strcmp(new_ctx->workstation_name, old_ctx->workstation_name)) { cifs_errorf(fc, "can not change workstation_name during remount\n"); return -EINVAL; } if (new_ctx->nodename && (!old_ctx->nodename || strcmp(new_ctx->nodename, old_ctx->nodename))) { cifs_errorf(fc, "can not change nodename during remount\n"); return -EINVAL; } if (new_ctx->iocharset && (!old_ctx->iocharset || strcmp(new_ctx->iocharset, old_ctx->iocharset))) { cifs_errorf(fc, "can not change iocharset during remount\n"); return -EINVAL; } return 0; } #define STEAL_STRING(cifs_sb, ctx, field) \ do { \ kfree(ctx->field); \ ctx->field = cifs_sb->ctx->field; \ cifs_sb->ctx->field = NULL; \ } while (0) #define STEAL_STRING_SENSITIVE(cifs_sb, ctx, field) \ do { \ kfree_sensitive(ctx->field); \ ctx->field = cifs_sb->ctx->field; \ cifs_sb->ctx->field = NULL; \ } while (0) static int smb3_reconfigure(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); struct dentry *root = fc->root; struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); struct cifs_ses *ses = cifs_sb_master_tcon(cifs_sb)->ses; bool need_recon = false; int rc; if (ses->expired_pwd) need_recon = true; rc = smb3_verify_reconfigure_ctx(fc, ctx, cifs_sb->ctx, need_recon); if (rc) return rc; /* * We can not change UNC/username/password/domainname/ * workstation_name/nodename/iocharset * during reconnect so ignore what we have in the new context and * just use what we already have in cifs_sb->ctx. */ STEAL_STRING(cifs_sb, ctx, UNC); STEAL_STRING(cifs_sb, ctx, source); STEAL_STRING(cifs_sb, ctx, username); if (need_recon == false) STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); else { kfree_sensitive(ses->password); ses->password = kstrdup(ctx->password, GFP_KERNEL); } STEAL_STRING(cifs_sb, ctx, domainname); STEAL_STRING(cifs_sb, ctx, nodename); STEAL_STRING(cifs_sb, ctx, iocharset); /* if rsize or wsize not passed in on remount, use previous values */ if (ctx->rsize == 0) ctx->rsize = cifs_sb->ctx->rsize; if (ctx->wsize == 0) ctx->wsize = cifs_sb->ctx->wsize; smb3_cleanup_fs_context_contents(cifs_sb->ctx); rc = smb3_fs_context_dup(cifs_sb->ctx, ctx); smb3_update_mnt_flags(cifs_sb); #ifdef CONFIG_CIFS_DFS_UPCALL if (!rc) rc = dfs_cache_remount_fs(cifs_sb); #endif return rc; } static int smb3_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result result; struct smb3_fs_context *ctx = smb3_fc2context(fc); int i, opt; bool is_smb3 = !strcmp(fc->fs_type->name, "smb3"); bool skip_parsing = false; kuid_t uid; kgid_t gid; cifs_dbg(FYI, "CIFS: parsing cifs mount option '%s'\n", param->key); /* * fs_parse can not handle string options with an empty value so * we will need special handling of them. */ if (param->type == fs_value_is_string && param->string[0] == 0) { if (!strcmp("pass", param->key) || !strcmp("password", param->key)) { skip_parsing = true; opt = Opt_pass; } else if (!strcmp("user", param->key) || !strcmp("username", param->key)) { skip_parsing = true; opt = Opt_user; } } if (!skip_parsing) { opt = fs_parse(fc, smb3_fs_parameters, param, &result); if (opt < 0) return ctx->sloppy ? 1 : opt; } switch (opt) { case Opt_compress: ctx->compress = true; cifs_dbg(VFS, "SMB3 compression support is experimental\n"); break; case Opt_nodfs: ctx->nodfs = 1; break; case Opt_hard: if (result.negated) { if (ctx->retry == 1) cifs_dbg(VFS, "conflicting hard vs. soft mount options\n"); ctx->retry = 0; } else ctx->retry = 1; break; case Opt_soft: if (result.negated) ctx->retry = 1; else { if (ctx->retry == 1) cifs_dbg(VFS, "conflicting hard vs soft mount options\n"); ctx->retry = 0; } break; case Opt_mapposix: if (result.negated) ctx->remap = false; else { ctx->remap = true; ctx->sfu_remap = false; /* disable SFU mapping */ } break; case Opt_mapchars: if (result.negated) ctx->sfu_remap = false; else { ctx->sfu_remap = true; ctx->remap = false; /* disable SFM (mapposix) mapping */ } break; case Opt_user_xattr: if (result.negated) ctx->no_xattr = 1; else ctx->no_xattr = 0; break; case Opt_forceuid: if (result.negated) ctx->override_uid = 0; else ctx->override_uid = 1; break; case Opt_forcegid: if (result.negated) ctx->override_gid = 0; else ctx->override_gid = 1; break; case Opt_perm: if (result.negated) ctx->noperm = 1; else ctx->noperm = 0; break; case Opt_dynperm: if (result.negated) ctx->dynperm = 0; else ctx->dynperm = 1; break; case Opt_sfu: if (result.negated) ctx->sfu_emul = 0; else ctx->sfu_emul = 1; break; case Opt_noblocksend: ctx->noblocksnd = 1; break; case Opt_noautotune: ctx->noautotune = 1; break; case Opt_nolease: ctx->no_lease = 1; break; case Opt_nosparse: ctx->no_sparse = 1; break; case Opt_nodelete: ctx->nodelete = 1; break; case Opt_multichannel: if (result.negated) { ctx->multichannel = false; ctx->max_channels = 1; } else { ctx->multichannel = true; /* if number of channels not specified, default to 2 */ if (ctx->max_channels < 2) ctx->max_channels = 2; } break; case Opt_uid: uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(uid)) goto cifs_parse_mount_err; ctx->linux_uid = uid; ctx->uid_specified = true; break; case Opt_cruid: uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(uid)) goto cifs_parse_mount_err; ctx->cred_uid = uid; ctx->cruid_specified = true; break; case Opt_backupuid: uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(uid)) goto cifs_parse_mount_err; ctx->backupuid = uid; ctx->backupuid_specified = true; break; case Opt_backupgid: gid = make_kgid(current_user_ns(), result.uint_32); if (!gid_valid(gid)) goto cifs_parse_mount_err; ctx->backupgid = gid; ctx->backupgid_specified = true; break; case Opt_gid: gid = make_kgid(current_user_ns(), result.uint_32); if (!gid_valid(gid)) goto cifs_parse_mount_err; ctx->linux_gid = gid; ctx->gid_specified = true; break; case Opt_port: ctx->port = result.uint_32; break; case Opt_file_mode: ctx->file_mode = result.uint_32; break; case Opt_dirmode: ctx->dir_mode = result.uint_32; break; case Opt_min_enc_offload: ctx->min_offload = result.uint_32; break; case Opt_retrans: ctx->retrans = result.uint_32; break; case Opt_blocksize: /* * inode blocksize realistically should never need to be * less than 16K or greater than 16M and default is 1MB. * Note that small inode block sizes (e.g. 64K) can lead * to very poor performance of common tools like cp and scp */ if ((result.uint_32 < CIFS_MAX_MSGSIZE) || (result.uint_32 > (4 * SMB3_DEFAULT_IOSIZE))) { cifs_errorf(fc, "%s: Invalid blocksize\n", __func__); goto cifs_parse_mount_err; } ctx->bsize = result.uint_32; ctx->got_bsize = true; break; case Opt_rasize: /* * readahead size realistically should never need to be * less than 1M (CIFS_DEFAULT_IOSIZE) or greater than 32M * (perhaps an exception should be considered in the * for the case of a large number of channels * when multichannel is negotiated) since that would lead * to plenty of parallel I/O in flight to the server. * Note that smaller read ahead sizes would * hurt performance of common tools like cp and scp * which often trigger sequential i/o with read ahead */ if ((result.uint_32 > (8 * SMB3_DEFAULT_IOSIZE)) || (result.uint_32 < CIFS_DEFAULT_IOSIZE)) { cifs_errorf(fc, "%s: Invalid rasize %d vs. %d\n", __func__, result.uint_32, SMB3_DEFAULT_IOSIZE); goto cifs_parse_mount_err; } ctx->rasize = result.uint_32; break; case Opt_rsize: ctx->rsize = result.uint_32; ctx->got_rsize = true; break; case Opt_wsize: ctx->wsize = result.uint_32; ctx->got_wsize = true; if (ctx->wsize % PAGE_SIZE != 0) { ctx->wsize = round_down(ctx->wsize, PAGE_SIZE); if (ctx->wsize == 0) { ctx->wsize = PAGE_SIZE; cifs_dbg(VFS, "wsize too small, reset to minimum %ld\n", PAGE_SIZE); } else { cifs_dbg(VFS, "wsize rounded down to %d to multiple of PAGE_SIZE %ld\n", ctx->wsize, PAGE_SIZE); } } break; case Opt_acregmax: ctx->acregmax = HZ * result.uint_32; if (ctx->acregmax > CIFS_MAX_ACTIMEO) { cifs_errorf(fc, "acregmax too large\n"); goto cifs_parse_mount_err; } break; case Opt_acdirmax: ctx->acdirmax = HZ * result.uint_32; if (ctx->acdirmax > CIFS_MAX_ACTIMEO) { cifs_errorf(fc, "acdirmax too large\n"); goto cifs_parse_mount_err; } break; case Opt_actimeo: if (HZ * result.uint_32 > CIFS_MAX_ACTIMEO) { cifs_errorf(fc, "timeout too large\n"); goto cifs_parse_mount_err; } if ((ctx->acdirmax != CIFS_DEF_ACTIMEO) || (ctx->acregmax != CIFS_DEF_ACTIMEO)) { cifs_errorf(fc, "actimeo ignored since acregmax or acdirmax specified\n"); break; } ctx->acdirmax = ctx->acregmax = HZ * result.uint_32; break; case Opt_closetimeo: ctx->closetimeo = HZ * result.uint_32; if (ctx->closetimeo > SMB3_MAX_DCLOSETIMEO) { cifs_errorf(fc, "closetimeo too large\n"); goto cifs_parse_mount_err; } break; case Opt_echo_interval: ctx->echo_interval = result.uint_32; break; case Opt_snapshot: ctx->snapshot_time = result.uint_64; break; case Opt_max_credits: if (result.uint_32 < 20 || result.uint_32 > 60000) { cifs_errorf(fc, "%s: Invalid max_credits value\n", __func__); goto cifs_parse_mount_err; } ctx->max_credits = result.uint_32; break; case Opt_max_channels: if (result.uint_32 < 1 || result.uint_32 > CIFS_MAX_CHANNELS) { cifs_errorf(fc, "%s: Invalid max_channels value, needs to be 1-%d\n", __func__, CIFS_MAX_CHANNELS); goto cifs_parse_mount_err; } ctx->max_channels = result.uint_32; /* If more than one channel requested ... they want multichan */ if (result.uint_32 > 1) ctx->multichannel = true; break; case Opt_max_cached_dirs: if (result.uint_32 < 1) { cifs_errorf(fc, "%s: Invalid max_cached_dirs, needs to be 1 or more\n", __func__); goto cifs_parse_mount_err; } ctx->max_cached_dirs = result.uint_32; break; case Opt_handletimeout: ctx->handle_timeout = result.uint_32; if (ctx->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) { cifs_errorf(fc, "Invalid handle cache timeout, longer than 16 minutes\n"); goto cifs_parse_mount_err; } break; case Opt_source: kfree(ctx->UNC); ctx->UNC = NULL; switch (smb3_parse_devname(param->string, ctx)) { case 0: break; case -ENOMEM: cifs_errorf(fc, "Unable to allocate memory for devname\n"); goto cifs_parse_mount_err; case -EINVAL: cifs_errorf(fc, "Malformed UNC in devname\n"); goto cifs_parse_mount_err; default: cifs_errorf(fc, "Unknown error parsing devname\n"); goto cifs_parse_mount_err; } ctx->source = smb3_fs_context_fullpath(ctx, '/'); if (IS_ERR(ctx->source)) { ctx->source = NULL; cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } fc->source = kstrdup(ctx->source, GFP_KERNEL); if (fc->source == NULL) { cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } break; case Opt_user: kfree(ctx->username); ctx->username = NULL; if (ctx->nullauth) break; if (strlen(param->string) == 0) { /* null user, ie. anonymous authentication */ ctx->nullauth = 1; break; } if (strnlen(param->string, CIFS_MAX_USERNAME_LEN) > CIFS_MAX_USERNAME_LEN) { pr_warn("username too long\n"); goto cifs_parse_mount_err; } ctx->username = kstrdup(param->string, GFP_KERNEL); if (ctx->username == NULL) { cifs_errorf(fc, "OOM when copying username string\n"); goto cifs_parse_mount_err; } break; case Opt_pass: kfree_sensitive(ctx->password); ctx->password = NULL; if (strlen(param->string) == 0) break; ctx->password = kstrdup(param->string, GFP_KERNEL); if (ctx->password == NULL) { cifs_errorf(fc, "OOM when copying password string\n"); goto cifs_parse_mount_err; } break; case Opt_ip: if (strlen(param->string) == 0) { ctx->got_ip = false; break; } if (!cifs_convert_address((struct sockaddr *)&ctx->dstaddr, param->string, strlen(param->string))) { pr_err("bad ip= option (%s)\n", param->string); goto cifs_parse_mount_err; } ctx->got_ip = true; break; case Opt_domain: if (strnlen(param->string, CIFS_MAX_DOMAINNAME_LEN) == CIFS_MAX_DOMAINNAME_LEN) { pr_warn("domain name too long\n"); goto cifs_parse_mount_err; } kfree(ctx->domainname); ctx->domainname = kstrdup(param->string, GFP_KERNEL); if (ctx->domainname == NULL) { cifs_errorf(fc, "OOM when copying domainname string\n"); goto cifs_parse_mount_err; } cifs_dbg(FYI, "Domain name set\n"); break; case Opt_srcaddr: if (!cifs_convert_address( (struct sockaddr *)&ctx->srcaddr, param->string, strlen(param->string))) { pr_warn("Could not parse srcaddr: %s\n", param->string); goto cifs_parse_mount_err; } break; case Opt_iocharset: if (strnlen(param->string, 1024) >= 65) { pr_warn("iocharset name too long\n"); goto cifs_parse_mount_err; } if (strncasecmp(param->string, "default", 7) != 0) { kfree(ctx->iocharset); ctx->iocharset = kstrdup(param->string, GFP_KERNEL); if (ctx->iocharset == NULL) { cifs_errorf(fc, "OOM when copying iocharset string\n"); goto cifs_parse_mount_err; } } /* if iocharset not set then load_nls_default * is used by caller */ cifs_dbg(FYI, "iocharset set to %s\n", ctx->iocharset); break; case Opt_netbiosname: memset(ctx->source_rfc1001_name, 0x20, RFC1001_NAME_LEN); /* * FIXME: are there cases in which a comma can * be valid in workstation netbios name (and * need special handling)? */ for (i = 0; i < RFC1001_NAME_LEN; i++) { /* don't ucase netbiosname for user */ if (param->string[i] == 0) break; ctx->source_rfc1001_name[i] = param->string[i]; } /* The string has 16th byte zero still from * set at top of the function */ if (i == RFC1001_NAME_LEN && param->string[i] != 0) pr_warn("netbiosname longer than 15 truncated\n"); break; case Opt_servern: /* last byte, type, is 0x20 for servr type */ memset(ctx->target_rfc1001_name, 0x20, RFC1001_NAME_LEN_WITH_NULL); /* * BB are there cases in which a comma can be valid in this * workstation netbios name (and need special handling)? */ /* user or mount helper must uppercase the netbios name */ for (i = 0; i < 15; i++) { if (param->string[i] == 0) break; ctx->target_rfc1001_name[i] = param->string[i]; } /* The string has 16th byte zero still from set at top of function */ if (i == RFC1001_NAME_LEN && param->string[i] != 0) pr_warn("server netbiosname longer than 15 truncated\n"); break; case Opt_ver: /* version of mount userspace tools, not dialect */ /* If interface changes in mount.cifs bump to new ver */ if (strncasecmp(param->string, "1", 1) == 0) { if (strlen(param->string) > 1) { pr_warn("Bad mount helper ver=%s. Did you want SMB1 (CIFS) dialect and mean to type vers=1.0 instead?\n", param->string); goto cifs_parse_mount_err; } /* This is the default */ break; } /* For all other value, error */ pr_warn("Invalid mount helper version specified\n"); goto cifs_parse_mount_err; case Opt_vers: /* protocol version (dialect) */ if (cifs_parse_smb_version(fc, param->string, ctx, is_smb3) != 0) goto cifs_parse_mount_err; ctx->got_version = true; break; case Opt_sec: if (cifs_parse_security_flavors(fc, param->string, ctx) != 0) goto cifs_parse_mount_err; break; case Opt_cache: if (cifs_parse_cache_flavor(fc, param->string, ctx) != 0) goto cifs_parse_mount_err; break; case Opt_witness: #ifndef CONFIG_CIFS_SWN_UPCALL cifs_errorf(fc, "Witness support needs CONFIG_CIFS_SWN_UPCALL config option\n"); goto cifs_parse_mount_err; #endif ctx->witness = true; pr_warn_once("Witness protocol support is experimental\n"); break; case Opt_rootfs: #ifndef CONFIG_CIFS_ROOT cifs_dbg(VFS, "rootfs support requires CONFIG_CIFS_ROOT config option\n"); goto cifs_parse_mount_err; #endif ctx->rootfs = true; break; case Opt_posixpaths: if (result.negated) ctx->posix_paths = 0; else ctx->posix_paths = 1; break; case Opt_unix: if (result.negated) { if (ctx->linux_ext == 1) pr_warn_once("conflicting posix mount options specified\n"); ctx->linux_ext = 0; ctx->no_linux_ext = 1; } else { if (ctx->no_linux_ext == 1) pr_warn_once("conflicting posix mount options specified\n"); ctx->linux_ext = 1; ctx->no_linux_ext = 0; } break; case Opt_nocase: ctx->nocase = 1; break; case Opt_brl: if (result.negated) { /* * turn off mandatory locking in mode * if remote locking is turned off since the * local vfs will do advisory */ if (ctx->file_mode == (S_IALLUGO & ~(S_ISUID | S_IXGRP))) ctx->file_mode = S_IALLUGO; ctx->nobrl = 1; } else ctx->nobrl = 0; break; case Opt_handlecache: if (result.negated) ctx->nohandlecache = 1; else ctx->nohandlecache = 0; break; case Opt_forcemandatorylock: ctx->mand_lock = 1; break; case Opt_setuids: ctx->setuids = result.negated; break; case Opt_intr: ctx->intr = !result.negated; break; case Opt_setuidfromacl: ctx->setuidfromacl = 1; break; case Opt_strictsync: ctx->nostrictsync = result.negated; break; case Opt_serverino: ctx->server_ino = !result.negated; break; case Opt_rwpidforward: ctx->rwpidforward = 1; break; case Opt_modesid: ctx->mode_ace = 1; break; case Opt_cifsacl: ctx->cifs_acl = !result.negated; break; case Opt_acl: ctx->no_psx_acl = result.negated; break; case Opt_locallease: ctx->local_lease = 1; break; case Opt_sign: ctx->sign = true; break; case Opt_ignore_signature: ctx->sign = true; ctx->ignore_signature = true; break; case Opt_seal: /* we do not do the following in secFlags because seal * is a per tree connection (mount) not a per socket * or per-smb connection option in the protocol * vol->secFlg |= CIFSSEC_MUST_SEAL; */ ctx->seal = 1; break; case Opt_noac: pr_warn("Mount option noac not supported. Instead set /proc/fs/cifs/LookupCacheEnabled to 0\n"); break; case Opt_fsc: #ifndef CONFIG_CIFS_FSCACHE cifs_errorf(fc, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n"); goto cifs_parse_mount_err; #endif ctx->fsc = true; break; case Opt_mfsymlinks: ctx->mfsymlinks = true; break; case Opt_multiuser: ctx->multiuser = true; break; case Opt_sloppy: ctx->sloppy = true; break; case Opt_nosharesock: ctx->nosharesock = true; break; case Opt_persistent: if (result.negated) { ctx->nopersistent = true; if (ctx->persistent) { cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } else { ctx->persistent = true; if ((ctx->nopersistent) || (ctx->resilient)) { cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } break; case Opt_resilient: if (result.negated) { ctx->resilient = false; /* already the default */ } else { ctx->resilient = true; if (ctx->persistent) { cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } break; case Opt_tcp_nodelay: /* tcp nodelay should not usually be needed since we CORK/UNCORK the socket */ if (result.negated) ctx->sockopt_tcp_nodelay = false; else ctx->sockopt_tcp_nodelay = true; break; case Opt_domainauto: ctx->domainauto = true; break; case Opt_rdma: ctx->rdma = true; break; case Opt_reparse: if (parse_reparse_flavor(fc, param->string, ctx)) goto cifs_parse_mount_err; break; } /* case Opt_ignore: - is ignored as expected ... */ return 0; cifs_parse_mount_err: kfree_sensitive(ctx->password); ctx->password = NULL; return -EINVAL; } int smb3_init_fs_context(struct fs_context *fc) { struct smb3_fs_context *ctx; char *nodename = utsname()->nodename; int i; ctx = kzalloc(sizeof(struct smb3_fs_context), GFP_KERNEL); if (unlikely(!ctx)) return -ENOMEM; strscpy(ctx->workstation_name, nodename, sizeof(ctx->workstation_name)); /* * does not have to be perfect mapping since field is * informational, only used for servers that do not support * port 445 and it can be overridden at mount time */ memset(ctx->source_rfc1001_name, 0x20, RFC1001_NAME_LEN); for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++) ctx->source_rfc1001_name[i] = toupper(nodename[i]); ctx->source_rfc1001_name[RFC1001_NAME_LEN] = 0; /* * null target name indicates to use *SMBSERVR default called name * if we end up sending RFC1001 session initialize */ ctx->target_rfc1001_name[0] = 0; ctx->cred_uid = current_uid(); ctx->linux_uid = current_uid(); ctx->linux_gid = current_gid(); /* By default 4MB read ahead size, 1MB block size */ ctx->bsize = CIFS_DEFAULT_IOSIZE; /* can improve cp performance significantly */ ctx->rasize = 0; /* 0 = use default (ie negotiated rsize) for read ahead pages */ /* * default to SFM style remapping of seven reserved characters * unless user overrides it or we negotiate CIFS POSIX where * it is unnecessary. Can not simultaneously use more than one mapping * since then readdir could list files that open could not open */ ctx->remap = true; /* default to only allowing write access to owner of the mount */ ctx->dir_mode = ctx->file_mode = S_IRUGO | S_IXUGO | S_IWUSR; /* ctx->retry default is 0 (i.e. "soft" limited retry not hard retry) */ /* default is always to request posix paths. */ ctx->posix_paths = 1; /* default to using server inode numbers where available */ ctx->server_ino = 1; /* default is to use strict cifs caching semantics */ ctx->strict_io = true; ctx->acregmax = CIFS_DEF_ACTIMEO; ctx->acdirmax = CIFS_DEF_ACTIMEO; ctx->closetimeo = SMB3_DEF_DCLOSETIMEO; ctx->max_cached_dirs = MAX_CACHED_FIDS; /* Most clients set timeout to 0, allows server to use its default */ ctx->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */ /* offer SMB2.1 and later (SMB3 etc). Secure and widely accepted */ ctx->ops = &smb30_operations; ctx->vals = &smbdefault_values; ctx->echo_interval = SMB_ECHO_INTERVAL_DEFAULT; /* default to no multichannel (single server connection) */ ctx->multichannel = false; ctx->max_channels = 1; ctx->backupuid_specified = false; /* no backup intent for a user */ ctx->backupgid_specified = false; /* no backup intent for a group */ ctx->retrans = 1; ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT; /* * short int override_uid = -1; * short int override_gid = -1; * char *nodename = strdup(utsname()->nodename); * struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr; */ fc->fs_private = ctx; fc->ops = &smb3_fs_context_ops; return 0; } void smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) { if (ctx == NULL) return; /* * Make sure this stays in sync with smb3_fs_context_dup() */ kfree(ctx->username); ctx->username = NULL; kfree_sensitive(ctx->password); ctx->password = NULL; kfree(ctx->server_hostname); ctx->server_hostname = NULL; kfree(ctx->UNC); ctx->UNC = NULL; kfree(ctx->source); ctx->source = NULL; kfree(ctx->domainname); ctx->domainname = NULL; kfree(ctx->nodename); ctx->nodename = NULL; kfree(ctx->iocharset); ctx->iocharset = NULL; kfree(ctx->prepath); ctx->prepath = NULL; kfree(ctx->leaf_fullpath); ctx->leaf_fullpath = NULL; } void smb3_cleanup_fs_context(struct smb3_fs_context *ctx) { if (!ctx) return; smb3_cleanup_fs_context_contents(ctx); kfree(ctx); } void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb) { struct smb3_fs_context *ctx = cifs_sb->ctx; if (ctx->nodfs) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_DFS; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_DFS; if (ctx->noperm) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_PERM; if (ctx->setuids) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SET_UID; if (ctx->setuidfromacl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UID_FROM_ACL; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_UID_FROM_ACL; if (ctx->server_ino) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; if (ctx->remap) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SFM_CHR; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MAP_SFM_CHR; if (ctx->sfu_remap) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MAP_SPECIAL_CHR; if (ctx->no_xattr) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_XATTR; if (ctx->sfu_emul) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_UNX_EMUL; if (ctx->nobrl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_BRL; if (ctx->nohandlecache) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_HANDLE_CACHE; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_HANDLE_CACHE; if (ctx->nostrictsync) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NOSSYNC; if (ctx->mand_lock) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NOPOSIXBRL; if (ctx->rwpidforward) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_RWPIDFORWARD; if (ctx->mode_ace) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MODE_FROM_SID; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MODE_FROM_SID; if (ctx->cifs_acl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_CIFS_ACL; if (ctx->backupuid_specified) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_CIFS_BACKUPUID; if (ctx->backupgid_specified) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_CIFS_BACKUPGID; if (ctx->override_uid) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_OVERR_UID; if (ctx->override_gid) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_OVERR_GID; if (ctx->dynperm) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_DYNPERM; if (ctx->fsc) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_FSCACHE; if (ctx->multiuser) cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_NO_PERM); else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MULTIUSER; if (ctx->strict_io) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_STRICT_IO; if (ctx->direct_io) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_DIRECT_IO; if (ctx->mfsymlinks) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MF_SYMLINKS; if (ctx->mfsymlinks) { if (ctx->sfu_emul) { /* * Our SFU ("Services for Unix" emulation does not allow * creating symlinks but does allow reading existing SFU * symlinks (it does allow both creating and reading SFU * style mknod and FIFOs though). When "mfsymlinks" and * "sfu" are both enabled at the same time, it allows * reading both types of symlinks, but will only create * them with mfsymlinks format. This allows better * Apple compatibility (probably better for Samba too) * while still recognizing old Windows style symlinks. */ cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n"); } } cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SHUTDOWN; return; } |
7 71 9 44 31 2 44 31 8 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MSR_H #define _ASM_X86_MSR_H #include "msr-index.h" #ifndef __ASSEMBLY__ #include <asm/asm.h> #include <asm/errno.h> #include <asm/cpumask.h> #include <uapi/asm/msr.h> #include <asm/shared/msr.h> #include <linux/percpu.h> struct msr_info { u32 msr_no; struct msr reg; struct msr __percpu *msrs; int err; }; struct msr_regs_info { u32 *regs; int err; }; struct saved_msr { bool valid; struct msr_info info; }; struct saved_msrs { unsigned int num; struct saved_msr *array; }; /* * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A" * constraint has different meanings. For i386, "A" means exactly * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead, * it means rax *or* rdx. */ #ifdef CONFIG_X86_64 /* Using 64-bit values saves one instruction clearing the high half of low */ #define DECLARE_ARGS(val, low, high) unsigned long low, high #define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32) #define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) #else #define DECLARE_ARGS(val, low, high) unsigned long long val #define EAX_EDX_VAL(val, low, high) (val) #define EAX_EDX_RET(val, low, high) "=A" (val) #endif /* * Be very careful with includes. This header is prone to include loops. */ #include <asm/atomic.h> #include <linux/tracepoint-defs.h> #ifdef CONFIG_TRACEPOINTS DECLARE_TRACEPOINT(read_msr); DECLARE_TRACEPOINT(write_msr); DECLARE_TRACEPOINT(rdpmc); extern void do_trace_write_msr(unsigned int msr, u64 val, int failed); extern void do_trace_read_msr(unsigned int msr, u64 val, int failed); extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed); #else static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {} static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {} static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} #endif /* * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR * accessors and should not have any tracing or other functionality piggybacking * on them - those are *purely* for accessing MSRs and nothing more. So don't even * think of extending them - you will be slapped with a stinking trout or a frozen * shark will reach you, wherever you are! You've been warned. */ static __always_inline unsigned long long __rdmsr(unsigned int msr) { DECLARE_ARGS(val, low, high); asm volatile("1: rdmsr\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR) : EAX_EDX_RET(val, low, high) : "c" (msr)); return EAX_EDX_VAL(val, low, high); } static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) { asm volatile("1: wrmsr\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) : : "c" (msr), "a"(low), "d" (high) : "memory"); } /* * WRMSRNS behaves exactly like WRMSR with the only difference being * that it is not a serializing instruction by default. */ static __always_inline void __wrmsrns(u32 msr, u32 low, u32 high) { /* Instruction opcode for WRMSRNS; supported in binutils >= 2.40. */ asm volatile("1: .byte 0x0f,0x01,0xc6\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) : : "c" (msr), "a"(low), "d" (high)); } #define native_rdmsr(msr, val1, val2) \ do { \ u64 __val = __rdmsr((msr)); \ (void)((val1) = (u32)__val); \ (void)((val2) = (u32)(__val >> 32)); \ } while (0) #define native_wrmsr(msr, low, high) \ __wrmsr(msr, low, high) #define native_wrmsrl(msr, val) \ __wrmsr((msr), (u32)((u64)(val)), \ (u32)((u64)(val) >> 32)) static inline unsigned long long native_read_msr(unsigned int msr) { unsigned long long val; val = __rdmsr(msr); if (tracepoint_enabled(read_msr)) do_trace_read_msr(msr, val, 0); return val; } static inline unsigned long long native_read_msr_safe(unsigned int msr, int *err) { DECLARE_ARGS(val, low, high); asm volatile("1: rdmsr ; xor %[err],%[err]\n" "2:\n\t" _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err]) : [err] "=r" (*err), EAX_EDX_RET(val, low, high) : "c" (msr)); if (tracepoint_enabled(read_msr)) do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err); return EAX_EDX_VAL(val, low, high); } /* Can be uninlined because referenced by paravirt */ static inline void notrace native_write_msr(unsigned int msr, u32 low, u32 high) { __wrmsr(msr, low, high); if (tracepoint_enabled(write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } /* Can be uninlined because referenced by paravirt */ static inline int notrace native_write_msr_safe(unsigned int msr, u32 low, u32 high) { int err; asm volatile("1: wrmsr ; xor %[err],%[err]\n" "2:\n\t" _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err]) : [err] "=a" (err) : "c" (msr), "0" (low), "d" (high) : "memory"); if (tracepoint_enabled(write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), err); return err; } extern int rdmsr_safe_regs(u32 regs[8]); extern int wrmsr_safe_regs(u32 regs[8]); /** * rdtsc() - returns the current TSC without ordering constraints * * rdtsc() returns the result of RDTSC as a 64-bit integer. The * only ordering constraint it supplies is the ordering implied by * "asm volatile": it will put the RDTSC in the place you expect. The * CPU can and will speculatively execute that RDTSC, though, so the * results can be non-monotonic if compared on different CPUs. */ static __always_inline unsigned long long rdtsc(void) { DECLARE_ARGS(val, low, high); asm volatile("rdtsc" : EAX_EDX_RET(val, low, high)); return EAX_EDX_VAL(val, low, high); } /** * rdtsc_ordered() - read the current TSC in program order * * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. * It is ordered like a load to a global in-memory counter. It should * be impossible to observe non-monotonic rdtsc_unordered() behavior * across multiple CPUs as long as the TSC is synced. */ static __always_inline unsigned long long rdtsc_ordered(void) { DECLARE_ARGS(val, low, high); /* * The RDTSC instruction is not ordered relative to memory * access. The Intel SDM and the AMD APM are both vague on this * point, but empirically an RDTSC instruction can be * speculatively executed before prior loads. An RDTSC * immediately after an appropriate barrier appears to be * ordered as a normal load, that is, it provides the same * ordering guarantees as reading from a global memory location * that some other imaginary CPU is updating continuously with a * time stamp. * * Thus, use the preferred barrier on the respective CPU, aiming for * RDTSCP as the default. */ asm volatile(ALTERNATIVE_2("rdtsc", "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC, "rdtscp", X86_FEATURE_RDTSCP) : EAX_EDX_RET(val, low, high) /* RDTSCP clobbers ECX with MSR_TSC_AUX. */ :: "ecx"); return EAX_EDX_VAL(val, low, high); } static inline unsigned long long native_read_pmc(int counter) { DECLARE_ARGS(val, low, high); asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter)); if (tracepoint_enabled(rdpmc)) do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0); return EAX_EDX_VAL(val, low, high); } #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #include <linux/errno.h> /* * Access to machine-specific registers (available on 586 and better only) * Note: the rd* operations modify the parameters directly (without using * pointer indirection), this allows gcc to optimize better */ #define rdmsr(msr, low, high) \ do { \ u64 __val = native_read_msr((msr)); \ (void)((low) = (u32)__val); \ (void)((high) = (u32)(__val >> 32)); \ } while (0) static inline void wrmsr(unsigned int msr, u32 low, u32 high) { native_write_msr(msr, low, high); } #define rdmsrl(msr, val) \ ((val) = native_read_msr((msr))) static inline void wrmsrl(unsigned int msr, u64 val) { native_write_msr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); } /* wrmsr with exception handling */ static inline int wrmsr_safe(unsigned int msr, u32 low, u32 high) { return native_write_msr_safe(msr, low, high); } /* rdmsr with exception handling */ #define rdmsr_safe(msr, low, high) \ ({ \ int __err; \ u64 __val = native_read_msr_safe((msr), &__err); \ (*low) = (u32)__val; \ (*high) = (u32)(__val >> 32); \ __err; \ }) static inline int rdmsrl_safe(unsigned int msr, unsigned long long *p) { int err; *p = native_read_msr_safe(msr, &err); return err; } #define rdpmc(counter, low, high) \ do { \ u64 _l = native_read_pmc((counter)); \ (low) = (u32)_l; \ (high) = (u32)(_l >> 32); \ } while (0) #define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) #endif /* !CONFIG_PARAVIRT_XXL */ static __always_inline void wrmsrns(u32 msr, u64 val) { __wrmsrns(msr, val, val >> 32); } /* * 64-bit version of wrmsr_safe(): */ static inline int wrmsrl_safe(u32 msr, u64 val) { return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); } struct msr __percpu *msrs_alloc(void); void msrs_free(struct msr __percpu *msrs); int msr_set_bit(u32 msr, u8 bit); int msr_clear_bit(u32 msr, u8 bit); #ifdef CONFIG_SMP int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q); void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs); void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q); int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); #else /* CONFIG_SMP */ static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { rdmsr(msr_no, *l, *h); return 0; } static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { wrmsr(msr_no, l, h); return 0; } static inline int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { rdmsrl(msr_no, *q); return 0; } static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { wrmsrl(msr_no, q); return 0; } static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr __percpu *msrs) { rdmsr_on_cpu(0, msr_no, raw_cpu_ptr(&msrs->l), raw_cpu_ptr(&msrs->h)); } static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr __percpu *msrs) { wrmsr_on_cpu(0, msr_no, raw_cpu_read(msrs->l), raw_cpu_read(msrs->h)); } static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { return rdmsr_safe(msr_no, l, h); } static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { return wrmsr_safe(msr_no, l, h); } static inline int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { return rdmsrl_safe(msr_no, q); } static inline int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { return wrmsrl_safe(msr_no, q); } static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { return rdmsr_safe_regs(regs); } static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { return wrmsr_safe_regs(regs); } #endif /* CONFIG_SMP */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_MSR_H */ |
30 5 6 11 43 47 30 3 2 84 131 65 3 3 3 2 2 2 2 2 1 1 6 6 5 5 1 2 3 63 64 64 63 1 57 55 7 7 69 29 14 48 3 3 3 62 3 3 35 29 30 2 65 65 65 62 2 2 65 63 64 64 64 65 64 1 1 1 1 1 1 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 14 2 2 1 2 2 16 2 2 1 1 1 1 28 12 45 44 43 44 39 39 4 1 4 4 3 9 9 8 5 3 1 43 2 5 2 43 7 48 39 9 60 22 1 2 4 1 2 1 3 1 2 89 89 3 85 84 1 32 6 1 35 13 11 39 50 5 45 6 41 42 21 3 4 42 5 37 46 46 39 16 2 2 20 22 18 44 1 1 940 941 131 110 37 130 1 1 1 1 1 1 1 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 | // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. */ #include <linux/completion.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> #include <linux/rbtree.h> #include <linux/igmp.h> #include <linux/xarray.h> #include <linux/inetdevice.h> #include <linux/slab.h> #include <linux/module.h> #include <net/route.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netevent.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/ip_fib.h> #include <net/ip6_route.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_cm_ib.h> #include <rdma/rdma_netlink.h> #include <rdma/ib.h> #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> #include <rdma/ib_sa.h> #include <rdma/iw_cm.h> #include "core_priv.h" #include "cma_priv.h" #include "cma_trace.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 16 #define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP static const char * const cma_events[] = { [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", [RDMA_CM_EVENT_ADDR_ERROR] = "address error", [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ", [RDMA_CM_EVENT_ROUTE_ERROR] = "route error", [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request", [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response", [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error", [RDMA_CM_EVENT_UNREACHABLE] = "unreachable", [RDMA_CM_EVENT_REJECTED] = "rejected", [RDMA_CM_EVENT_ESTABLISHED] = "established", [RDMA_CM_EVENT_DISCONNECTED] = "disconnected", [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal", [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join", [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error", [RDMA_CM_EVENT_ADDR_CHANGE] = "address change", [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", }; static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type); const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { size_t index = event; return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ? cma_events[index] : "unrecognized event"; } EXPORT_SYMBOL(rdma_event_msg); const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return ibcm_reject_msg(reason); if (rdma_protocol_iwarp(id->device, id->port_num)) return iwcm_reject_msg(reason); WARN_ON_ONCE(1); return "unrecognized transport"; } EXPORT_SYMBOL(rdma_reject_msg); /** * rdma_is_consumer_reject - return true if the consumer rejected the connect * request. * @id: Communication identifier that received the REJECT event. * @reason: Value returned in the REJECT event status field. */ static bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return reason == IB_CM_REJ_CONSUMER_DEFINED; if (rdma_protocol_iwarp(id->device, id->port_num)) return reason == -ECONNREFUSED; WARN_ON_ONCE(1); return false; } const void *rdma_consumer_reject_data(struct rdma_cm_id *id, struct rdma_cm_event *ev, u8 *data_len) { const void *p; if (rdma_is_consumer_reject(id, ev->status)) { *data_len = ev->param.conn.private_data_len; p = ev->param.conn.private_data; } else { *data_len = 0; p = NULL; } return p; } EXPORT_SYMBOL(rdma_consumer_reject_data); /** * rdma_iw_cm_id() - return the iw_cm_id pointer for this cm_id. * @id: Communication Identifier */ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); if (id->device->node_type == RDMA_NODE_RNIC) return id_priv->cm_id.iw; return NULL; } EXPORT_SYMBOL(rdma_iw_cm_id); /** * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack. * @res: rdma resource tracking entry pointer */ struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); return &id_priv->id; } EXPORT_SYMBOL(rdma_res_to_id); static int cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); static struct ib_client cma_client = { .name = "cma", .add = cma_add_one, .remove = cma_remove_one }; static struct ib_sa_client sa_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct rb_root id_table = RB_ROOT; /* Serialize operations of id_table tree */ static DEFINE_SPINLOCK(id_table_lock); static struct workqueue_struct *cma_wq; static unsigned int cma_pernet_id; struct cma_pernet { struct xarray tcp_ps; struct xarray udp_ps; struct xarray ipoib_ps; struct xarray ib_ps; }; static struct cma_pernet *cma_pernet(struct net *net) { return net_generic(net, cma_pernet_id); } static struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps) { struct cma_pernet *pernet = cma_pernet(net); switch (ps) { case RDMA_PS_TCP: return &pernet->tcp_ps; case RDMA_PS_UDP: return &pernet->udp_ps; case RDMA_PS_IPOIB: return &pernet->ipoib_ps; case RDMA_PS_IB: return &pernet->ib_ps; default: return NULL; } } struct id_table_entry { struct list_head id_list; struct rb_node rb_node; }; struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; refcount_t refcount; struct list_head id_list; enum ib_gid_type *default_gid_type; u8 *default_roce_tos; }; struct rdma_bind_list { enum rdma_ucm_port_space ps; struct hlist_head owners; unsigned short port; }; static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); return xa_insert(xa, snum, bind_list, GFP_KERNEL); } static struct rdma_bind_list *cma_ps_find(struct net *net, enum rdma_ucm_port_space ps, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); return xa_load(xa, snum); } static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); xa_erase(xa, snum); } enum { CMA_OPTION_AFONLY, }; void cma_dev_get(struct cma_device *cma_dev) { refcount_inc(&cma_dev->refcount); } void cma_dev_put(struct cma_device *cma_dev) { if (refcount_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); } struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, void *cookie) { struct cma_device *cma_dev; struct cma_device *found_cma_dev = NULL; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) if (filter(cma_dev->device, cookie)) { found_cma_dev = cma_dev; break; } if (found_cma_dev) cma_dev_get(found_cma_dev); mutex_unlock(&lock); return found_cma_dev; } int cma_get_default_gid_type(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_gid_type(struct cma_device *cma_dev, u32 port, enum ib_gid_type default_gid_type) { unsigned long supported_gids; if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; if (default_gid_type == IB_GID_TYPE_IB && rdma_protocol_roce_eth_encap(cma_dev->device, port)) default_gid_type = IB_GID_TYPE_ROCE; supported_gids = roce_gid_type_mask_support(cma_dev->device, port); if (!(supported_gids & 1 << default_gid_type)) return -EINVAL; cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = default_gid_type; return 0; } int cma_get_default_roce_tos(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_roce_tos(struct cma_device *cma_dev, u32 port, u8 default_roce_tos) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)] = default_roce_tos; return 0; } struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) { return cma_dev->device; } /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. * We do this by disabling removal notification while a callback is in process, * and reporting it after the callback completes. */ struct cma_multicast { struct rdma_id_private *id_priv; union { struct ib_sa_multicast *sa_mc; struct { struct work_struct work; struct rdma_cm_event event; } iboe_join; }; struct list_head list; void *context; struct sockaddr_storage addr; u8 join_state; }; struct cma_work { struct work_struct work; struct rdma_id_private *id; enum rdma_cm_state old_state; enum rdma_cm_state new_state; struct rdma_cm_event event; }; union cma_ip_addr { struct in6_addr ip6; struct { __be32 pad[3]; __be32 addr; } ip4; }; struct cma_hdr { u8 cma_version; u8 ip_version; /* IP version: 7:4 */ __be16 port; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; #define CMA_VERSION 0x00 struct cma_req_info { struct sockaddr_storage listen_addr_storage; struct sockaddr_storage src_addr_storage; struct ib_device *device; union ib_gid local_gid; __be64 service_id; int port; bool has_gid; u16 pkey; }; static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; /* * The FSM uses a funny double locking where state is protected by both * the handler_mutex and the spinlock. State is not allowed to change * to/from a handler_mutex protected value without also holding * handler_mutex. */ if (comp == RDMA_CM_CONNECT || exch == RDMA_CM_CONNECT) lockdep_assert_held(&id_priv->handler_mutex); spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) { return hdr->ip_version >> 4; } static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) { hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *)&id_priv->id.route.addr.src_addr; } static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *)&id_priv->id.route.addr.dst_addr; } static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) { struct in_device *in_dev = NULL; if (ndev) { rtnl_lock(); in_dev = __in_dev_get_rtnl(ndev); if (in_dev) { if (join) ip_mc_inc_group(in_dev, *(__be32 *)(mgid->raw + 12)); else ip_mc_dec_group(in_dev, *(__be32 *)(mgid->raw + 12)); } rtnl_unlock(); } return (in_dev) ? 0 : -ENODEV; } static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, struct id_table_entry *entry_b) { struct rdma_id_private *id_priv = list_first_entry( &entry_b->id_list, struct rdma_id_private, id_list_entry); int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if; struct sockaddr *sb = cma_dst_addr(id_priv); if (ifindex_a != ifindex_b) return (ifindex_a > ifindex_b) ? 1 : -1; if (sa->sa_family != sb->sa_family) return sa->sa_family - sb->sa_family; if (sa->sa_family == AF_INET && __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in)) { return memcmp(&((struct sockaddr_in *)sa)->sin_addr, &((struct sockaddr_in *)sb)->sin_addr, sizeof(((struct sockaddr_in *)sa)->sin_addr)); } if (sa->sa_family == AF_INET6 && __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in6)) { return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, &((struct sockaddr_in6 *)sb)->sin6_addr); } return -1; } static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) { struct rb_node **new, *parent = NULL; struct id_table_entry *this, *node; unsigned long flags; int result; node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; spin_lock_irqsave(&id_table_lock, flags); new = &id_table.rb_node; while (*new) { this = container_of(*new, struct id_table_entry, rb_node); result = compare_netdev_and_ip( node_id_priv->id.route.addr.dev_addr.bound_dev_if, cma_dst_addr(node_id_priv), this); parent = *new; if (result < 0) new = &((*new)->rb_left); else if (result > 0) new = &((*new)->rb_right); else { list_add_tail(&node_id_priv->id_list_entry, &this->id_list); kfree(node); goto unlock; } } INIT_LIST_HEAD(&node->id_list); list_add_tail(&node_id_priv->id_list_entry, &node->id_list); rb_link_node(&node->rb_node, parent, new); rb_insert_color(&node->rb_node, &id_table); unlock: spin_unlock_irqrestore(&id_table_lock, flags); return 0; } static struct id_table_entry * node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa) { struct rb_node *node = root->rb_node; struct id_table_entry *data; int result; while (node) { data = container_of(node, struct id_table_entry, rb_node); result = compare_netdev_and_ip(ifindex, sa, data); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return data; } return NULL; } static void cma_remove_id_from_tree(struct rdma_id_private *id_priv) { struct id_table_entry *data; unsigned long flags; spin_lock_irqsave(&id_table_lock, flags); if (list_empty(&id_priv->id_list_entry)) goto out; data = node_from_ndev_ip(&id_table, id_priv->id.route.addr.dev_addr.bound_dev_if, cma_dst_addr(id_priv)); if (!data) goto out; list_del_init(&id_priv->id_list_entry); if (list_empty(&data->id_list)) { rb_erase(&data->rb_node, &id_table); kfree(data); } out: spin_unlock_irqrestore(&id_table_lock, flags); } static void _cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { cma_dev_get(cma_dev); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->device_item, &cma_dev->id_list); trace_cm_id_attach(id_priv, cma_dev->device); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { _cma_attach_to_dev(id_priv, cma_dev); id_priv->gid_type = cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(cma_dev->device)]; } static void cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); list_del_init(&id_priv->device_item); cma_dev_put(id_priv->cma_dev); id_priv->cma_dev = NULL; id_priv->id.device = NULL; if (id_priv->id.route.addr.dev_addr.sgid_attr) { rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); id_priv->id.route.addr.dev_addr.sgid_attr = NULL; } mutex_unlock(&lock); } static inline unsigned short cma_family(struct rdma_id_private *id_priv) { return id_priv->id.route.addr.src_addr.ss_family; } static int cma_set_default_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; switch (id_priv->id.ps) { case RDMA_PS_UDP: case RDMA_PS_IB: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (!ret) id_priv->qkey = be32_to_cpu(rec.qkey); break; default: break; } return ret; } static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) { if (!qkey || (id_priv->qkey && (id_priv->qkey != qkey))) return -EINVAL; id_priv->qkey = qkey; return 0; } static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { dev_addr->dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); } static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) { int ret; if (addr->sa_family != AF_IB) { ret = rdma_translate_ip(addr, dev_addr); } else { cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); ret = 0; } return ret; } static const struct ib_gid_attr * cma_validate_port(struct ib_device *device, u32 port, enum ib_gid_type gid_type, union ib_gid *gid, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr = ERR_PTR(-ENODEV); int bound_if_index = dev_addr->bound_dev_if; int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) goto out; if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) goto out; if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) goto out; /* * For drivers that do not associate more than one net device with * their gid tables, such as iWARP drivers, it is sufficient to * return the first table entry. * * Other driver classes might be included in the future. */ if (rdma_protocol_iwarp(device, port)) { sgid_attr = rdma_get_gid_attr(device, port, 0); if (IS_ERR(sgid_attr)) goto out; rcu_read_lock(); ndev = rcu_dereference(sgid_attr->ndev); if (!net_eq(dev_net(ndev), dev_addr->net) || ndev->ifindex != bound_if_index) sgid_attr = ERR_PTR(-ENODEV); rcu_read_unlock(); goto out; } if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { ndev = dev_get_by_index(dev_addr->net, bound_if_index); if (!ndev) goto out; } else { gid_type = IB_GID_TYPE_IB; } sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev); dev_put(ndev); out: return sgid_attr; } static void cma_bind_sgid_attr(struct rdma_id_private *id_priv, const struct ib_gid_attr *sgid_attr) { WARN_ON(id_priv->id.route.addr.dev_addr.sgid_attr); id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr; } /** * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute * based on source ip address. * @id_priv: cm_id which should be bound to cma device * * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute * based on source IP address. It returns 0 on success or error code otherwise. * It is applicable to active and passive side cm_id. */ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; union ib_gid gid, iboe_gid, *gidp; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) { rdma_for_each_port (cma_dev->device, port) { gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, gidp, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); cma_attach_to_dev(id_priv, cma_dev); ret = 0; goto out; } } } out: mutex_unlock(&lock); return ret; } /** * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute * @id_priv: cm id to bind to cma device * @listen_id_priv: listener cm id to match against * @req: Pointer to req structure containaining incoming * request information * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when * rdma device matches for listen_id and incoming request. It also verifies * that a GID table entry is present for the source address. * Returns 0 on success, or returns error code otherwise. */ static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv, struct cma_req_info *req) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; enum ib_gid_type gid_type; union ib_gid gid; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; if (rdma_protocol_roce(req->device, req->port)) rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &gid); else memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1]; sgid_attr = cma_validate_port(req->device, req->port, gid_type, &gid, id_priv); if (IS_ERR(sgid_attr)) return PTR_ERR(sgid_attr); id_priv->id.port_num = req->port; cma_bind_sgid_attr(id_priv, sgid_attr); /* Need to acquire lock to protect against reader * of cma_dev->id_list such as cma_netdev_callback() and * cma_process_remove(). */ mutex_lock(&lock); cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); mutex_unlock(&lock); rdma_restrack_add(&id_priv->res); return 0; } static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; union ib_gid gid; u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); mutex_lock(&lock); cma_dev = listen_id_priv->cma_dev; port = listen_id_priv->id.port_num; gid_type = listen_id_priv->gid_type; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); ret = 0; goto out; } list_for_each_entry(cma_dev, &dev_list, list) { rdma_for_each_port (cma_dev->device, port) { if (listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); ret = 0; goto out; } } } out: if (!ret) { cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); } mutex_unlock(&lock); return ret; } /* * Select the source IB device and address to reach the destination IB address. */ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; struct sockaddr_ib *addr; union ib_gid gid, sgid, *dgid; unsigned int p; u16 pkey, index; enum ib_port_state port_state; int ret; int i; cma_dev = NULL; addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); dgid = (union ib_gid *) &addr->sib_addr; pkey = ntohs(addr->sib_pkey); mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { rdma_for_each_port (cur_dev->device, p) { if (!rdma_cap_af_ib(cur_dev->device, p)) continue; if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) continue; if (ib_get_cached_port_state(cur_dev->device, p, &port_state)) continue; for (i = 0; i < cur_dev->device->port_data[p].immutable.gid_tbl_len; ++i) { ret = rdma_query_gid(cur_dev->device, p, i, &gid); if (ret) continue; if (!memcmp(&gid, dgid, sizeof(gid))) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } if (!cma_dev && (gid.global.subnet_prefix == dgid->global.subnet_prefix) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } } } } mutex_unlock(&lock); return -ENODEV; found: cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); mutex_unlock(&lock); addr = (struct sockaddr_ib *)cma_src_addr(id_priv); memcpy(&addr->sib_addr, &sgid, sizeof(sgid)); cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); return 0; } static void cma_id_get(struct rdma_id_private *id_priv) { refcount_inc(&id_priv->refcount); } static void cma_id_put(struct rdma_id_private *id_priv) { if (refcount_dec_and_test(&id_priv->refcount)) complete(&id_priv->comp); } static struct rdma_id_private * __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const struct rdma_id_private *parent) { struct rdma_id_private *id_priv; id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); if (!id_priv) return ERR_PTR(-ENOMEM); id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; id_priv->timeout_set = false; id_priv->min_rnr_timer_set = false; id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); refcount_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); INIT_LIST_HEAD(&id_priv->device_item); INIT_LIST_HEAD(&id_priv->id_list_entry); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); id_priv->id.route.addr.dev_addr.net = get_net(net); id_priv->seq_num &= 0x00ffffff; rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID); if (parent) rdma_restrack_parent_name(&id_priv->res, &parent->res); return id_priv; } struct rdma_cm_id * __rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const char *caller) { struct rdma_id_private *ret; ret = __rdma_create_id(net, event_handler, context, ps, qp_type, NULL); if (IS_ERR(ret)) return ERR_CAST(ret); rdma_restrack_set_name(&ret->res, caller); return &ret->id; } EXPORT_SYMBOL(__rdma_create_kernel_id); struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type) { struct rdma_id_private *ret; ret = __rdma_create_id(current->nsproxy->net_ns, event_handler, context, ps, qp_type, NULL); if (IS_ERR(ret)) return ERR_CAST(ret); rdma_restrack_set_name(&ret->res, NULL); return &ret->id; } EXPORT_SYMBOL(rdma_create_user_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTS; qp_attr.sq_psn = 0; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); return ret; } static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; return ib_modify_qp(qp, &qp_attr, qp_attr_mask); } int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { struct rdma_id_private *id_priv; struct ib_qp *qp; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id->device != pd->device) { ret = -EINVAL; goto out_err; } qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto out_err; } if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); if (ret) goto out_destroy; id->qp = qp; id_priv->qp_num = qp->qp_num; id_priv->srq = (qp->srq != NULL); trace_cm_qp_create(id_priv, pd, qp_init_attr, 0); return 0; out_destroy: ib_destroy_qp(qp); out_err: trace_cm_qp_create(id_priv, pd, qp_init_attr, ret); return ret; } EXPORT_SYMBOL(rdma_create_qp); void rdma_destroy_qp(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); trace_cm_qp_destroy(id_priv); mutex_lock(&id_priv->qp_mutex); ib_destroy_qp(id_priv->id.qp); id_priv->id.qp = NULL; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_destroy_qp); static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } /* Need to update QP attributes from default values. */ qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); if (ret) goto out; qp_attr.qp_state = IB_QPS_RTR; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; BUG_ON(id_priv->cma_dev->device != id_priv->id.device); if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_rts(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_RTS; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_err(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; int ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_ERR; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int ret; u16 pkey; if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num)) pkey = 0xffff; else pkey = ib_addr_get_pkey(dev_addr); ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, pkey, &qp_attr->pkey_index); if (ret) return ret; qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { ret = cma_set_default_qkey(id_priv); if (ret) return ret; qp_attr->qkey = id_priv->qkey; *qp_attr_mask |= IB_QP_QKEY; } else { qp_attr->qp_access_flags = 0; *qp_attr_mask |= IB_QP_ACCESS_FLAGS; } return 0; } int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_id_private *id_priv; int ret = 0; id_priv = container_of(id, struct rdma_id_private, id); if (rdma_cap_ib_cm(id->device, id->port_num)) { if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; } else if (rdma_cap_iw_cm(id->device, id->port_num)) { if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask |= IB_QP_PORT; } else { ret = -ENOSYS; } if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) qp_attr->timeout = id_priv->timeout; if ((*qp_attr_mask & IB_QP_MIN_RNR_TIMER) && id_priv->min_rnr_timer_set) qp_attr->min_rnr_timer = id_priv->min_rnr_timer; return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); static inline bool cma_zero_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_any(&((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: return ib_addr_any(&((struct sockaddr_ib *)addr)->sib_addr); default: return false; } } static inline bool cma_loopback_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_loopback( ((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_loopback( &((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: return ib_addr_loopback( &((struct sockaddr_ib *)addr)->sib_addr); default: return false; } } static inline bool cma_any_addr(const struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst) { if (src->sa_family != dst->sa_family) return -1; switch (src->sa_family) { case AF_INET: return ((struct sockaddr_in *)src)->sin_addr.s_addr != ((struct sockaddr_in *)dst)->sin_addr.s_addr; case AF_INET6: { struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src; struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst; bool link_local; if (ipv6_addr_cmp(&src_addr6->sin6_addr, &dst_addr6->sin6_addr)) return 1; link_local = ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL; /* Link local must match their scope_ids */ return link_local ? (src_addr6->sin6_scope_id != dst_addr6->sin6_scope_id) : 0; } default: return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, &((struct sockaddr_ib *) dst)->sib_addr); } } static __be16 cma_port(const struct sockaddr *addr) { struct sockaddr_ib *sib; switch (addr->sa_family) { case AF_INET: return ((struct sockaddr_in *) addr)->sin_port; case AF_INET6: return ((struct sockaddr_in6 *) addr)->sin6_port; case AF_IB: sib = (struct sockaddr_ib *) addr; return htons((u16) (be64_to_cpu(sib->sib_sid) & be64_to_cpu(sib->sib_sid_mask))); default: return 0; } } static inline int cma_any_port(const struct sockaddr *addr) { return !cma_port(addr); } static void cma_save_ib_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct rdma_cm_id *listen_id, const struct sa_path_rec *path) { struct sockaddr_ib *listen_ib, *ib; listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; if (src_addr) { ib = (struct sockaddr_ib *)src_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->sgid, 16); ib->sib_sid = path->service_id; ib->sib_scope_id = 0; } else { ib->sib_pkey = listen_ib->sib_pkey; ib->sib_flowinfo = listen_ib->sib_flowinfo; ib->sib_addr = listen_ib->sib_addr; ib->sib_sid = listen_ib->sib_sid; ib->sib_scope_id = listen_ib->sib_scope_id; } ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); } if (dst_addr) { ib = (struct sockaddr_ib *)dst_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->dgid, 16); } } } static void cma_save_ip4_info(struct sockaddr_in *src_addr, struct sockaddr_in *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr.s_addr = hdr->dst_addr.ip4.addr, .sin_port = local_port, }; } if (dst_addr) { *dst_addr = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr.s_addr = hdr->src_addr.ip4.addr, .sin_port = hdr->port, }; } } static void cma_save_ip6_info(struct sockaddr_in6 *src_addr, struct sockaddr_in6 *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = hdr->dst_addr.ip6, .sin6_port = local_port, }; } if (dst_addr) { *dst_addr = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = hdr->src_addr.ip6, .sin6_port = hdr->port, }; } } static u16 cma_port_from_service_id(__be64 service_id) { return (u16)be64_to_cpu(service_id); } static int cma_save_ip_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct ib_cm_event *ib_event, __be64 service_id) { struct cma_hdr *hdr; __be16 port; hdr = ib_event->private_data; if (hdr->cma_version != CMA_VERSION) return -EINVAL; port = htons(cma_port_from_service_id(service_id)); switch (cma_get_ip_ver(hdr)) { case 4: cma_save_ip4_info((struct sockaddr_in *)src_addr, (struct sockaddr_in *)dst_addr, hdr, port); break; case 6: cma_save_ip6_info((struct sockaddr_in6 *)src_addr, (struct sockaddr_in6 *)dst_addr, hdr, port); break; default: return -EAFNOSUPPORT; } return 0; } static int cma_save_net_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, sa_family_t sa_family, __be64 service_id) { if (sa_family == AF_IB) { if (ib_event->event == IB_CM_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, ib_event->param.req_rcvd.primary_path); else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, NULL); return 0; } return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id); } static int cma_save_req_info(const struct ib_cm_event *ib_event, struct cma_req_info *req) { const struct ib_cm_req_event_param *req_param = &ib_event->param.req_rcvd; const struct ib_cm_sidr_req_event_param *sidr_param = &ib_event->param.sidr_req_rcvd; switch (ib_event->event) { case IB_CM_REQ_RECEIVED: req->device = req_param->listen_id->device; req->port = req_param->port; memcpy(&req->local_gid, &req_param->primary_path->sgid, sizeof(req->local_gid)); req->has_gid = true; req->service_id = req_param->primary_path->service_id; req->pkey = be16_to_cpu(req_param->primary_path->pkey); if (req->pkey != req_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", req_param->bth_pkey, req->pkey); break; case IB_CM_SIDR_REQ_RECEIVED: req->device = sidr_param->listen_id->device; req->port = sidr_param->port; req->has_gid = false; req->service_id = sidr_param->service_id; req->pkey = sidr_param->pkey; if (req->pkey != sidr_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", sidr_param->bth_pkey, req->pkey); break; default: return -EINVAL; } return 0; } static bool validate_ipv4_net_dev(struct net_device *net_dev, const struct sockaddr_in *dst_addr, const struct sockaddr_in *src_addr) { __be32 daddr = dst_addr->sin_addr.s_addr, saddr = src_addr->sin_addr.s_addr; struct fib_result res; struct flowi4 fl4; int err; bool ret; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) || ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) || ipv4_is_loopback(saddr)) return false; memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_oif = net_dev->ifindex; fl4.daddr = daddr; fl4.saddr = saddr; rcu_read_lock(); err = fib_lookup(dev_net(net_dev), &fl4, &res, 0); ret = err == 0 && FIB_RES_DEV(res) == net_dev; rcu_read_unlock(); return ret; } static bool validate_ipv6_net_dev(struct net_device *net_dev, const struct sockaddr_in6 *dst_addr, const struct sockaddr_in6 *src_addr) { #if IS_ENABLED(CONFIG_IPV6) const int strict = ipv6_addr_type(&dst_addr->sin6_addr) & IPV6_ADDR_LINKLOCAL; struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, &src_addr->sin6_addr, net_dev->ifindex, NULL, strict); bool ret; if (!rt) return false; ret = rt->rt6i_idev->dev == net_dev; ip6_rt_put(rt); return ret; #else return false; #endif } static bool validate_net_dev(struct net_device *net_dev, const struct sockaddr *daddr, const struct sockaddr *saddr) { const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr; const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr; switch (daddr->sa_family) { case AF_INET: return saddr->sa_family == AF_INET && validate_ipv4_net_dev(net_dev, daddr4, saddr4); case AF_INET6: return saddr->sa_family == AF_INET6 && validate_ipv6_net_dev(net_dev, daddr6, saddr6); default: return false; } } static struct net_device * roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event) { const struct ib_gid_attr *sgid_attr = NULL; struct net_device *ndev; if (ib_event->event == IB_CM_REQ_RECEIVED) sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr; else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) sgid_attr = ib_event->param.sidr_req_rcvd.sgid_attr; if (!sgid_attr) return NULL; rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr); if (IS_ERR(ndev)) ndev = NULL; else dev_hold(ndev); rcu_read_unlock(); return ndev; } static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event, struct cma_req_info *req) { struct sockaddr *listen_addr = (struct sockaddr *)&req->listen_addr_storage; struct sockaddr *src_addr = (struct sockaddr *)&req->src_addr_storage; struct net_device *net_dev; const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL; int err; err = cma_save_ip_info(listen_addr, src_addr, ib_event, req->service_id); if (err) return ERR_PTR(err); if (rdma_protocol_roce(req->device, req->port)) net_dev = roce_get_net_dev_by_cm_event(ib_event); else net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, gid, listen_addr); if (!net_dev) return ERR_PTR(-ENODEV); return net_dev; } static enum rdma_ucm_port_space rdma_ps_from_service_id(__be64 service_id) { return (be64_to_cpu(service_id) >> 16) & 0xffff; } static bool cma_match_private_data(struct rdma_id_private *id_priv, const struct cma_hdr *hdr) { struct sockaddr *addr = cma_src_addr(id_priv); __be32 ip4_addr; struct in6_addr ip6_addr; if (cma_any_addr(addr) && !id_priv->afonly) return true; switch (addr->sa_family) { case AF_INET: ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; if (cma_get_ip_ver(hdr) != 4) return false; if (!cma_any_addr(addr) && hdr->dst_addr.ip4.addr != ip4_addr) return false; break; case AF_INET6: ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; if (cma_get_ip_ver(hdr) != 6) return false; if (!cma_any_addr(addr) && memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) return false; break; case AF_IB: return true; default: return false; } return true; } static bool cma_protocol_roce(const struct rdma_cm_id *id) { struct ib_device *device = id->device; const u32 port_num = id->port_num ?: rdma_start_port(device); return rdma_protocol_roce(device, port_num); } static bool cma_is_req_ipv6_ll(const struct cma_req_info *req) { const struct sockaddr *daddr = (const struct sockaddr *)&req->listen_addr_storage; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; /* Returns true if the req is for IPv6 link local */ return (daddr->sa_family == AF_INET6 && (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)); } static bool cma_match_net_dev(const struct rdma_cm_id *id, const struct net_device *net_dev, const struct cma_req_info *req) { const struct rdma_addr *addr = &id->route.addr; if (!net_dev) /* This request is an AF_IB request */ return (!id->port_num || id->port_num == req->port) && (addr->src_addr.ss_family == AF_IB); /* * If the request is not for IPv6 link local, allow matching * request to any netdevice of the one or multiport rdma device. */ if (!cma_is_req_ipv6_ll(req)) return true; /* * Net namespaces must match, and if the listner is listening * on a specific netdevice than netdevice must match as well. */ if (net_eq(dev_net(net_dev), addr->dev_addr.net) && (!!addr->dev_addr.bound_dev_if == (addr->dev_addr.bound_dev_if == net_dev->ifindex))) return true; else return false; } static struct rdma_id_private *cma_find_listener( const struct rdma_bind_list *bind_list, const struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, const struct cma_req_info *req, const struct net_device *net_dev) { struct rdma_id_private *id_priv, *id_priv_dev; lockdep_assert_held(&lock); if (!bind_list) return ERR_PTR(-EINVAL); hlist_for_each_entry(id_priv, &bind_list->owners, node) { if (cma_match_private_data(id_priv, ib_event->private_data)) { if (id_priv->id.device == cm_id->device && cma_match_net_dev(&id_priv->id, net_dev, req)) return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, listen_item) { if (id_priv_dev->id.device == cm_id->device && cma_match_net_dev(&id_priv_dev->id, net_dev, req)) return id_priv_dev; } } } return ERR_PTR(-EINVAL); } static struct rdma_id_private * cma_ib_id_from_event(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, struct cma_req_info *req, struct net_device **net_dev) { struct rdma_bind_list *bind_list; struct rdma_id_private *id_priv; int err; err = cma_save_req_info(ib_event, req); if (err) return ERR_PTR(err); *net_dev = cma_get_net_dev(ib_event, req); if (IS_ERR(*net_dev)) { if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ *net_dev = NULL; } else { return ERR_CAST(*net_dev); } } mutex_lock(&lock); /* * Net namespace might be getting deleted while route lookup, * cm_id lookup is in progress. Therefore, perform netdevice * validation, cm_id lookup under rcu lock. * RCU lock along with netdevice state check, synchronizes with * netdevice migrating to different net namespace and also avoids * case where net namespace doesn't get deleted while lookup is in * progress. * If the device state is not IFF_UP, its properties such as ifindex * and nd_net cannot be trusted to remain valid without rcu lock. * net/core/dev.c change_net_namespace() ensures to synchronize with * ongoing operations on net device after device is closed using * synchronize_net(). */ rcu_read_lock(); if (*net_dev) { /* * If netdevice is down, it is likely that it is administratively * down or it might be migrating to different namespace. * In that case avoid further processing, as the net namespace * or ifindex may change. */ if (((*net_dev)->flags & IFF_UP) == 0) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } if (!validate_net_dev(*net_dev, (struct sockaddr *)&req->src_addr_storage, (struct sockaddr *)&req->listen_addr_storage)) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } } bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net, rdma_ps_from_service_id(req->service_id), cma_port_from_service_id(req->service_id)); id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); err: rcu_read_unlock(); mutex_unlock(&lock); if (IS_ERR(id_priv) && *net_dev) { dev_put(*net_dev); *net_dev = NULL; } return id_priv; } static inline u8 cma_user_data_offset(struct rdma_id_private *id_priv) { return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); } static void cma_cancel_route(struct rdma_id_private *id_priv) { if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) { if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); } } static void _cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; lockdep_assert_held(&lock); /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ list_del_init(&id_priv->listen_any_item); while (!list_empty(&id_priv->listen_list)) { dev_id_priv = list_first_entry(&id_priv->listen_list, struct rdma_id_private, listen_item); /* sync with device removal to avoid duplicate destruction */ list_del_init(&dev_id_priv->device_item); list_del_init(&dev_id_priv->listen_item); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } } static void cma_cancel_listens(struct rdma_id_private *id_priv) { mutex_lock(&lock); _cma_cancel_listens(id_priv); mutex_unlock(&lock); } static void cma_cancel_operation(struct rdma_id_private *id_priv, enum rdma_cm_state state) { switch (state) { case RDMA_CM_ADDR_QUERY: /* * We can avoid doing the rdma_addr_cancel() based on state, * only RDMA_CM_ADDR_QUERY has a work that could still execute. * Notice that the addr_handler work could still be exiting * outside this state, however due to the interaction with the * handler_mutex the work is guaranteed not to touch id_priv * during exit. */ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: cma_cancel_route(id_priv); break; case RDMA_CM_LISTEN: if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: break; } } static void cma_release_port(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list = id_priv->bind_list; struct net *net = id_priv->id.route.addr.dev_addr.net; if (!bind_list) return; mutex_lock(&lock); hlist_del(&id_priv->node); if (hlist_empty(&bind_list->owners)) { cma_ps_remove(net, bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); } static void destroy_mc(struct rdma_id_private *id_priv, struct cma_multicast *mc) { bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) ib_sa_free_multicast(mc->sa_mc); if (rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num)) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct net_device *ndev = NULL; if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (ndev && !send_only) { enum ib_gid_type gid_type; union ib_gid mgid; gid_type = id_priv->cma_dev->default_gid_type [id_priv->id.port_num - rdma_start_port( id_priv->cma_dev->device)]; cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid, gid_type); cma_igmp_send(ndev, &mgid, false); } dev_put(ndev); cancel_work_sync(&mc->iboe_join.work); } kfree(mc); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { mc = list_first_entry(&id_priv->mc_list, struct cma_multicast, list); list_del(&mc->list); destroy_mc(id_priv, mc); } } static void _destroy_id(struct rdma_id_private *id_priv, enum rdma_cm_state state) { cma_cancel_operation(id_priv, state); rdma_restrack_del(&id_priv->res); cma_remove_id_from_tree(id_priv); if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) ib_destroy_cm_id(id_priv->cm_id.ib); } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); } cma_leave_mc_groups(id_priv); cma_release_dev(id_priv); } cma_release_port(id_priv); cma_id_put(id_priv); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) cma_id_put(id_priv->id.context); kfree(id_priv->id.route.path_rec); kfree(id_priv->id.route.path_rec_inbound); kfree(id_priv->id.route.path_rec_outbound); put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); } /* * destroy an ID from within the handler_mutex. This ensures that no other * handlers can start running concurrently. */ static void destroy_id_handler_unlock(struct rdma_id_private *id_priv) __releases(&idprv->handler_mutex) { enum rdma_cm_state state; unsigned long flags; trace_cm_id_destroy(id_priv); /* * Setting the state to destroyed under the handler mutex provides a * fence against calling handler callbacks. If this is invoked due to * the failure of a handler callback then it guarentees that no future * handlers will be called. */ lockdep_assert_held(&id_priv->handler_mutex); spin_lock_irqsave(&id_priv->lock, flags); state = id_priv->state; id_priv->state = RDMA_CM_DESTROYING; spin_unlock_irqrestore(&id_priv->lock, flags); mutex_unlock(&id_priv->handler_mutex); _destroy_id(id_priv, state); } void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->handler_mutex); destroy_id_handler_unlock(id_priv); } EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) { int ret; ret = cma_modify_qp_rtr(id_priv, NULL); if (ret) goto reject; ret = cma_modify_qp_rts(id_priv, NULL); if (ret) goto reject; trace_cm_send_rtu(id_priv); ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; return 0; reject: pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. status %d\n", ret); cma_modify_qp_err(id_priv); trace_cm_send_rej(id_priv); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; } static void cma_set_rep_event_data(struct rdma_cm_event *event, const struct ib_cm_rep_event_param *rep_data, void *private_data) { event->param.conn.private_data = private_data; event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; event->param.conn.responder_resources = rep_data->responder_resources; event->param.conn.initiator_depth = rep_data->initiator_depth; event->param.conn.flow_control = rep_data->flow_control; event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; event->ece.vendor_id = rep_data->ece.vendor_id; event->ece.attr_mod = rep_data->ece.attr_mod; } static int cma_cm_event_handler(struct rdma_id_private *id_priv, struct rdma_cm_event *event) { int ret; lockdep_assert_held(&id_priv->handler_mutex); trace_cm_event_handler(id_priv, event); ret = id_priv->id.event_handler(&id_priv->id, event); trace_cm_event_done(id_priv, event, ret); return ret; } static int cma_ib_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; enum rdma_cm_state state; int ret; mutex_lock(&id_priv->handler_mutex); state = READ_ONCE(id_priv->state); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && state != RDMA_CM_DISCONNECT)) goto out; switch (ib_event->event) { case IB_CM_REQ_ERROR: case IB_CM_REP_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: if (state == RDMA_CM_CONNECT && (id_priv->id.qp_type != IB_QPT_UD)) { trace_cm_send_mra(id_priv); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR : RDMA_CM_EVENT_ESTABLISHED; } else { event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; } cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, ib_event->private_data); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: event.status = -ETIMEDOUT; fallthrough; case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_DISCONNECT)) goto out; event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT: event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; case IB_CM_REJ_RECEIVED: pr_debug_ratelimited("RDMA CM: REJECTED: %s\n", rdma_reject_msg(&id_priv->id, ib_event->param.rej_rcvd.reason)); cma_modify_qp_err(id_priv); event.status = ib_event->param.rej_rcvd.reason; event.event = RDMA_CM_EVENT_REJECTED; event.param.conn.private_data = ib_event->private_data; event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return 0; } static struct rdma_id_private * cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, struct net_device *net_dev) { struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct sa_path_rec *path = ib_event->param.req_rcvd.primary_path; const __be64 service_id = ib_event->param.req_rcvd.primary_path->service_id; int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); id_priv = __rdma_create_id(listen_id->route.addr.dev_addr.net, listen_id->event_handler, listen_id->context, listen_id->ps, ib_event->param.req_rcvd.qp_type, listen_id_priv); if (IS_ERR(id_priv)) return NULL; id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, service_id)) goto err; rt = &id->route; rt->num_pri_alt_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; rt->path_rec = kmalloc_array(rt->num_pri_alt_paths, sizeof(*rt->path_rec), GFP_KERNEL); if (!rt->path_rec) goto err; rt->path_rec[0] = *path; if (rt->num_pri_alt_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev); } else { if (!cma_protocol_roce(listen_id) && cma_any_addr(cma_src_addr(id_priv))) { rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); } else if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); if (ret) goto err; } } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static struct rdma_id_private * cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, struct net_device *net_dev) { const struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct net *net = listen_id->route.addr.dev_addr.net; int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); id_priv = __rdma_create_id(net, listen_id->event_handler, listen_id->context, listen_id->ps, IB_QPT_UD, listen_id_priv); if (IS_ERR(id_priv)) return NULL; id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, ib_event->param.sidr_req_rcvd.service_id)) goto err; if (net_dev) { rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev); } else { if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr); if (ret) goto err; } } id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static void cma_set_req_event_data(struct rdma_cm_event *event, const struct ib_cm_req_event_param *req_data, void *private_data, int offset) { event->param.conn.private_data = private_data + offset; event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; event->param.conn.responder_resources = req_data->responder_resources; event->param.conn.initiator_depth = req_data->initiator_depth; event->param.conn.flow_control = req_data->flow_control; event->param.conn.retry_count = req_data->retry_count; event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; event->ece.vendor_id = req_data->ece.vendor_id; event->ece.attr_mod = req_data->ece.attr_mod; } static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id, const struct ib_cm_event *ib_event) { return (((ib_event->event == IB_CM_REQ_RECEIVED) && (ib_event->param.req_rcvd.qp_type == id->qp_type)) || ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && (id->qp_type == IB_QPT_UD)) || (!id->qp_type)); } static int cma_ib_req_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event = {}; struct cma_req_info req = {}; struct net_device *net_dev; u8 offset; int ret; listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev); if (IS_ERR(listen_id)) return PTR_ERR(listen_id); trace_cm_req_handler(listen_id, ib_event->event); if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) { ret = -EINVAL; goto net_dev_put; } mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) { ret = -ECONNABORTED; goto err_unlock; } offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { conn_id = cma_ib_new_udp_id(&listen_id->id, ib_event, net_dev); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { conn_id = cma_ib_new_conn_id(&listen_id->id, ib_event, net_dev); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } if (!conn_id) { ret = -ENOMEM; goto err_unlock; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); ret = cma_ib_acquire_dev(conn_id, listen_id, &req); if (ret) { destroy_id_handler_unlock(conn_id); goto err_unlock; } conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ conn_id->cm_id.ib = NULL; mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); goto net_dev_put; } if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && conn_id->id.qp_type != IB_QPT_UD) { trace_cm_send_mra(cm_id->context); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } mutex_unlock(&conn_id->handler_mutex); err_unlock: mutex_unlock(&listen_id->handler_mutex); net_dev_put: dev_put(net_dev); return ret; } __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) { if (addr->sa_family == AF_IB) return ((struct sockaddr_ib *) addr)->sib_sid; return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); } EXPORT_SYMBOL(rdma_get_service_id); void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, union ib_gid *dgid) { struct rdma_addr *addr = &cm_id->route.addr; if (!cm_id->device) { if (sgid) memset(sgid, 0, sizeof(*sgid)); if (dgid) memset(dgid, 0, sizeof(*dgid)); return; } if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) { if (sgid) rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid); if (dgid) rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid); } else { if (sgid) rdma_addr_get_sgid(&addr->dev_addr, sgid); if (dgid) rdma_addr_get_dgid(&addr->dev_addr, dgid); } } EXPORT_SYMBOL(rdma_read_gids); static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event = {}; int ret = 0; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: memcpy(cma_src_addr(id_priv), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(id_priv), raddr, rdma_addr_size(raddr)); switch (iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; case -ECONNRESET: case -ECONNREFUSED: event.event = RDMA_CM_EVENT_REJECTED; break; case -ETIMEDOUT: event.event = RDMA_CM_EVENT_UNREACHABLE; break; default: event.event = RDMA_CM_EVENT_CONNECT_ERROR; break; } break; case IW_CM_EVENT_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; default: goto out; } event.status = iw_event->status; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event = {}; int ret = -ECONNABORTED; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; listen_id = cm_id->context; mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) goto out; /* Create a new RDMA id for the new IW CM ID */ conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP, IB_QPT_RC, listen_id); if (IS_ERR(conn_id)) { ret = -ENOMEM; goto out; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) { mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } conn_id->cm_id.iw = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } mutex_unlock(&conn_id->handler_mutex); out: mutex_unlock(&listen_id->handler_mutex); return ret; } static int cma_ib_listen(struct rdma_id_private *id_priv) { struct sockaddr *addr; struct ib_cm_id *id; __be64 svc_id; addr = cma_src_addr(id_priv); svc_id = rdma_get_service_id(&id_priv->id, addr); id = ib_cm_insert_listen(id_priv->id.device, cma_ib_req_handler, svc_id); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.ib = id; return 0; } static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; struct iw_cm_id *id; id = iw_create_cm_id(id_priv->id.device, iw_conn_req_handler, id_priv); if (IS_ERR(id)) return PTR_ERR(id); mutex_lock(&id_priv->qp_mutex); id->tos = id_priv->tos; id->tos_set = id_priv->tos_set; mutex_unlock(&id_priv->qp_mutex); id->afonly = id_priv->afonly; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); ret = iw_cm_listen(id_priv->cm_id.iw, backlog); if (ret) { iw_destroy_cm_id(id_priv->cm_id.iw); id_priv->cm_id.iw = NULL; } return ret; } static int cma_listen_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rdma_id_private *id_priv = id->context; /* Listening IDs are always destroyed on removal */ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) return -1; id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; trace_cm_event_handler(id_priv, event); return id_priv->id.event_handler(id, event); } static int cma_listen_on_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev, struct rdma_id_private **to_destroy) { struct rdma_id_private *dev_id_priv; struct net *net = id_priv->id.route.addr.dev_addr.net; int ret; lockdep_assert_held(&lock); *to_destroy = NULL; if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) return 0; dev_id_priv = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type, id_priv); if (IS_ERR(dev_id_priv)) return PTR_ERR(dev_id_priv); dev_id_priv->state = RDMA_CM_ADDR_BOUND; memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); _cma_attach_to_dev(dev_id_priv, cma_dev); rdma_restrack_add(&dev_id_priv->res); cma_id_get(id_priv); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; mutex_lock(&id_priv->qp_mutex); dev_id_priv->tos_set = id_priv->tos_set; dev_id_priv->tos = id_priv->tos; mutex_unlock(&id_priv->qp_mutex); ret = rdma_listen(&dev_id_priv->id, id_priv->backlog); if (ret) goto err_listen; list_add_tail(&dev_id_priv->listen_item, &id_priv->listen_list); return 0; err_listen: /* Caller must destroy this after releasing lock */ *to_destroy = dev_id_priv; dev_warn(&cma_dev->device->dev, "RDMA CMA: %s, error %d\n", __func__, ret); return ret; } static int cma_listen_on_all(struct rdma_id_private *id_priv) { struct rdma_id_private *to_destroy; struct cma_device *cma_dev; int ret; mutex_lock(&lock); list_add_tail(&id_priv->listen_any_item, &listen_any_list); list_for_each_entry(cma_dev, &dev_list, list) { ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); if (ret) { /* Prevent racing with cma_process_remove() */ if (to_destroy) list_del_init(&to_destroy->device_item); goto err_listen; } } mutex_unlock(&lock); return 0; err_listen: _cma_cancel_listens(id_priv); mutex_unlock(&lock); if (to_destroy) rdma_destroy_id(&to_destroy->id); return ret; } void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->tos = (u8) tos; id_priv->tos_set = true; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_set_service_type); /** * rdma_set_ack_timeout() - Set the ack timeout of QP associated * with a connection identifier. * @id: Communication identifier to associated with service type. * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec. * * This function should be called before rdma_connect() on active side, * and on passive side before rdma_accept(). It is applicable to primary * path only. The timeout will affect the local side of the QP, it is not * negotiated with remote side and zero disables the timer. In case it is * set before rdma_resolve_route, the value will also be used to determine * PacketLifeTime for RoCE. * * Return: 0 for success */ int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) { struct rdma_id_private *id_priv; if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->timeout = timeout; id_priv->timeout_set = true; mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_ack_timeout); /** * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the * QP associated with a connection identifier. * @id: Communication identifier to associated with service type. * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK * Timer Field" in the IBTA specification. * * This function should be called before rdma_connect() on active * side, and on passive side before rdma_accept(). The timer value * will be associated with the local QP. When it receives a send it is * not read to handle, typically if the receive queue is empty, an RNR * Retry NAK is returned to the requester with the min_rnr_timer * encoded. The requester will then wait at least the time specified * in the NAK before retrying. The default is zero, which translates * to a minimum RNR Timer value of 655 ms. * * Return: 0 for success */ int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer) { struct rdma_id_private *id_priv; /* It is a five-bit value */ if (min_rnr_timer & 0xe0) return -EINVAL; if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT)) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->min_rnr_timer = min_rnr_timer; id_priv->min_rnr_timer_set = true; mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_min_rnr_timer); static int route_set_path_rec_inbound(struct cma_work *work, struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; if (!route->path_rec_inbound) { route->path_rec_inbound = kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); if (!route->path_rec_inbound) return -ENOMEM; } *route->path_rec_inbound = *path_rec; return 0; } static int route_set_path_rec_outbound(struct cma_work *work, struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; if (!route->path_rec_outbound) { route->path_rec_outbound = kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL); if (!route->path_rec_outbound) return -ENOMEM; } *route->path_rec_outbound = *path_rec; return 0; } static void cma_query_handler(int status, struct sa_path_rec *path_rec, unsigned int num_prs, void *context) { struct cma_work *work = context; struct rdma_route *route; int i; route = &work->id->id.route; if (status) goto fail; for (i = 0; i < num_prs; i++) { if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) *route->path_rec = path_rec[i]; else if (path_rec[i].flags & IB_PATH_INBOUND) status = route_set_path_rec_inbound(work, &path_rec[i]); else if (path_rec[i].flags & IB_PATH_OUTBOUND) status = route_set_path_rec_outbound(work, &path_rec[i]); else status = -EINVAL; if (status) goto fail; } route->num_pri_alt_paths = 1; queue_work(cma_wq, &work->work); return; fail: work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; work->event.status = status; pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", status); queue_work(cma_wq, &work->work); } static int cma_query_ib_route(struct rdma_id_private *id_priv, unsigned long timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; struct sockaddr_ib *sib; memset(&path_rec, 0, sizeof path_rec); if (rdma_cap_opa_ah(id_priv->id.device, id_priv->id.port_num)) path_rec.rec_type = SA_PATH_REC_TYPE_OPA; else path_rec.rec_type = SA_PATH_REC_TYPE_IB; rdma_addr_get_sgid(dev_addr, &path_rec.sgid); rdma_addr_get_dgid(dev_addr, &path_rec.dgid); path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; switch (cma_family(id_priv)) { case AF_INET: path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; break; case AF_INET6: sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; case AF_IB: sib = (struct sockaddr_ib *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, id_priv->id.port_num, &path_rec, comp_mask, timeout_ms, GFP_KERNEL, cma_query_handler, work, &id_priv->query); return (id_priv->query_id < 0) ? id_priv->query_id : 0; } static void cma_iboe_join_work_handler(struct work_struct *work) { struct cma_multicast *mc = container_of(work, struct cma_multicast, iboe_join.work); struct rdma_cm_event *event = &mc->iboe_join.event; struct rdma_id_private *id_priv = mc->id_priv; int ret; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; ret = cma_cm_event_handler(id_priv, event); WARN_ON(ret); out_unlock: mutex_unlock(&id_priv->handler_mutex); if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN) rdma_destroy_ah_attr(&event->param.ud.ah_attr); } static void cma_work_handler(struct work_struct *_work) { struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; if (work->old_state != 0 || work->new_state != 0) { if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) goto out_unlock; } if (cma_cm_event_handler(id_priv, &work->event)) { cma_id_put(id_priv); destroy_id_handler_unlock(id_priv); goto out_free; } out_unlock: mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); out_free: if (work->event.event == RDMA_CM_EVENT_MULTICAST_JOIN) rdma_destroy_ah_attr(&work->event.param.ud.ah_attr); kfree(work); } static void cma_init_resolve_route_work(struct cma_work *work, struct rdma_id_private *id_priv) { work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; } static void enqueue_resolve_addr_work(struct cma_work *work, struct rdma_id_private *id_priv) { /* Balances with cma_id_put() in cma_work_handler */ cma_id_get(id_priv); work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); } static int cma_resolve_ib_route(struct rdma_id_private *id_priv, unsigned long timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; cma_init_resolve_route_work(work, id_priv); if (!route->path_rec) route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } ret = cma_query_ib_route(id_priv, timeout_ms, work); if (ret) goto err2; return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, unsigned long supported_gids, enum ib_gid_type default_gid) { if ((network_type == RDMA_NETWORK_IPV4 || network_type == RDMA_NETWORK_IPV6) && test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) return IB_GID_TYPE_ROCE_UDP_ENCAP; return default_gid; } /* * cma_iboe_set_path_rec_l2_fields() is helper function which sets * path record type based on GID type. * It also sets up other L2 fields which includes destination mac address * netdev ifindex, of the path record. * It returns the netdev of the bound interface for this path record entry. */ static struct net_device * cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; struct rdma_addr *addr = &route->addr; unsigned long supported_gids; struct net_device *ndev; if (!addr->dev_addr.bound_dev_if) return NULL; ndev = dev_get_by_index(addr->dev_addr.net, addr->dev_addr.bound_dev_if); if (!ndev) return NULL; supported_gids = roce_gid_type_mask_support(id_priv->id.device, id_priv->id.port_num); gid_type = cma_route_gid_type(addr->dev_addr.network, supported_gids, id_priv->gid_type); /* Use the hint from IP Stack to select GID Type */ if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) gid_type = ib_network_to_gid_type(addr->dev_addr.network); route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); route->path_rec->roce.route_resolved = true; sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); return ndev; } int rdma_set_ib_path(struct rdma_cm_id *id, struct sa_path_rec *path_rec) { struct rdma_id_private *id_priv; struct net_device *ndev; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec), GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } if (rdma_protocol_roce(id->device, id->port_num)) { ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err_free; } dev_put(ndev); } id->route.num_pri_alt_paths = 1; return 0; err_free: kfree(id->route.path_rec); id->route.path_rec = NULL; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_set_ib_path); static int cma_resolve_iw_route(struct rdma_id_private *id_priv) { struct cma_work *work; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; } static int get_vlan_ndev_tc(struct net_device *vlan_ndev, int prio) { struct net_device *dev; dev = vlan_dev_real_dev(vlan_ndev); if (dev->num_tc) return netdev_get_prio_tc_map(dev, prio); return (vlan_dev_get_egress_qos_mask(vlan_ndev, prio) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } struct iboe_prio_tc_map { int input_prio; int output_tc; bool found; }; static int get_lower_vlan_dev_tc(struct net_device *dev, struct netdev_nested_priv *priv) { struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data; if (is_vlan_dev(dev)) map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); else if (dev->num_tc) map->output_tc = netdev_get_prio_tc_map(dev, map->input_prio); else map->output_tc = 0; /* We are interested only in first level VLAN device, so always * return 1 to stop iterating over next level devices. */ map->found = true; return 1; } static int iboe_tos_to_sl(struct net_device *ndev, int tos) { struct iboe_prio_tc_map prio_tc_map = {}; int prio = rt_tos2priority(tos); struct netdev_nested_priv priv; /* If VLAN device, get it directly from the VLAN netdev */ if (is_vlan_dev(ndev)) return get_vlan_ndev_tc(ndev, prio); prio_tc_map.input_prio = prio; priv.data = (void *)&prio_tc_map; rcu_read_lock(); netdev_walk_all_lower_dev_rcu(ndev, get_lower_vlan_dev_tc, &priv); rcu_read_unlock(); /* If map is found from lower device, use it; Otherwise * continue with the current netdevice to get priority to tc map. */ if (prio_tc_map.found) return prio_tc_map.output_tc; else if (ndev->num_tc) return netdev_get_prio_tc_map(ndev, prio); else return 0; } static __be32 cma_get_roce_udp_flow_label(struct rdma_id_private *id_priv) { struct sockaddr_in6 *addr6; u16 dport, sport; u32 hash, fl; addr6 = (struct sockaddr_in6 *)cma_src_addr(id_priv); fl = be32_to_cpu(addr6->sin6_flowinfo) & IB_GRH_FLOWLABEL_MASK; if ((cma_family(id_priv) != AF_INET6) || !fl) { dport = be16_to_cpu(cma_port(cma_dst_addr(id_priv))); sport = be16_to_cpu(cma_port(cma_src_addr(id_priv))); hash = (u32)sport * 31 + dport; fl = hash & IB_GRH_FLOWLABEL_MASK; } return cpu_to_be32(fl); } static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; struct net_device *ndev; u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; u8 tos; mutex_lock(&id_priv->qp_mutex); tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; mutex_unlock(&id_priv->qp_mutex); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } route->num_pri_alt_paths = 1; ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */ route->path_rec->hop_limit = addr->dev_addr.hoplimit; else route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; route->path_rec->sl = iboe_tos_to_sl(ndev, tos); route->path_rec->traffic_class = tos; route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = IB_RATE_PORT_CURRENT; dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; /* In case ACK timeout is set, use this value to calculate * PacketLifeTime. As per IBTA 12.7.34, * local ACK timeout = (2 * PacketLifeTime + Local CA’s ACK delay). * Assuming a negligible local ACK delay, we can use * PacketLifeTime = local ACK timeout/2 * as a reasonable approximation for RoCE networks. */ mutex_lock(&id_priv->qp_mutex); if (id_priv->timeout_set && id_priv->timeout) route->path_rec->packet_life_time = id_priv->timeout - 1; else route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; mutex_unlock(&id_priv->qp_mutex); if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } if (rdma_protocol_roce_udp_encap(id_priv->id.device, id_priv->id.port_num)) route->path_rec->flow_label = cma_get_roce_udp_flow_label(id_priv); cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; err2: kfree(route->path_rec); route->path_rec = NULL; route->num_pri_alt_paths = 0; err1: kfree(work); return ret; } int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; if (!timeout_ms) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) return -EINVAL; cma_id_get(id_priv); if (rdma_cap_ib_sa(id->device, id->port_num)) ret = cma_resolve_ib_route(id_priv, timeout_ms); else if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_resolve_iboe_route(id_priv); if (!ret) cma_add_id_to_tree(id_priv); } else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv); else ret = -ENOSYS; if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); cma_id_put(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); static void cma_set_loopback(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); break; case AF_INET6: ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, 0, 0, 0, htonl(1)); break; default: ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, 0, 0, 0, htonl(1)); break; } } static int cma_bind_loopback(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; union ib_gid gid; enum ib_port_state port_state; unsigned int p; u16 pkey; int ret; cma_dev = NULL; mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cur_dev->device, 1)) continue; if (!cma_dev) cma_dev = cur_dev; rdma_for_each_port (cur_dev->device, p) { if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; goto port_found; } } } if (!cma_dev) { ret = -ENODEV; goto out; } p = 1; port_found: ret = rdma_query_gid(cma_dev->device, p, 0, &gid); if (ret) goto out; ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); if (ret) goto out; id_priv->id.route.addr.dev_addr.dev_type = (rdma_protocol_ib(cma_dev->device, p)) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); return ret; } static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_dev_addr *dev_addr, void *context) { struct rdma_id_private *id_priv = context; struct rdma_cm_event event = {}; struct sockaddr *addr; struct sockaddr_storage old_addr; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; /* * Store the previous src address, so that if we fail to acquire * matching rdma device, old address can be restored back, which helps * to cancel the cma listen operation correctly. */ addr = cma_src_addr(id_priv); memcpy(&old_addr, addr, rdma_addr_size(addr)); memcpy(addr, src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { status = cma_acquire_dev_by_src_ip(id_priv); if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", status); rdma_restrack_add(&id_priv->res); } else if (status) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status); } if (status) { memcpy(addr, &old_addr, rdma_addr_size((struct sockaddr *)&old_addr)); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; if (cma_cm_event_handler(id_priv, &event)) { destroy_id_handler_unlock(id_priv); return; } out: mutex_unlock(&id_priv->handler_mutex); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; union ib_gid gid; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_bind_loopback(id_priv); if (ret) goto err; } rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) { struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_resolve_ib_dev(id_priv); if (ret) goto err; } rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if ((reuse && id_priv->state != RDMA_CM_LISTEN) || id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_reuseaddr); int rdma_set_afonly(struct rdma_cm_id *id, int afonly) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { id_priv->options |= (1 << CMA_OPTION_AFONLY); id_priv->afonly = afonly; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_afonly); static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct sockaddr *addr; struct sockaddr_ib *sib; u64 sid, mask; __be16 port; lockdep_assert_held(&lock); addr = cma_src_addr(id_priv); port = htons(bind_list->port); switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_port = port; break; case AF_INET6: ((struct sockaddr_in6 *) addr)->sin6_port = port; break; case AF_IB: sib = (struct sockaddr_ib *) addr; sid = be64_to_cpu(sib->sib_sid); mask = be64_to_cpu(sib->sib_sid_mask); sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); sib->sib_sid_mask = cpu_to_be64(~0ULL); break; } id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } static int cma_alloc_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; int ret; lockdep_assert_held(&lock); bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list, snum); if (ret < 0) goto err; bind_list->ps = ps; bind_list->port = snum; cma_bind_port(bind_list, id_priv); return 0; err: kfree(bind_list); return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; } static int cma_port_is_unique(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct rdma_id_private *cur_id; struct sockaddr *daddr = cma_dst_addr(id_priv); struct sockaddr *saddr = cma_src_addr(id_priv); __be16 dport = cma_port(daddr); lockdep_assert_held(&lock); hlist_for_each_entry(cur_id, &bind_list->owners, node) { struct sockaddr *cur_daddr = cma_dst_addr(cur_id); struct sockaddr *cur_saddr = cma_src_addr(cur_id); __be16 cur_dport = cma_port(cur_daddr); if (id_priv == cur_id) continue; /* different dest port -> unique */ if (!cma_any_port(daddr) && !cma_any_port(cur_daddr) && (dport != cur_dport)) continue; /* different src address -> unique */ if (!cma_any_addr(saddr) && !cma_any_addr(cur_saddr) && cma_addr_cmp(saddr, cur_saddr)) continue; /* different dst address -> unique */ if (!cma_any_addr(daddr) && !cma_any_addr(cur_daddr) && cma_addr_cmp(daddr, cur_daddr)) continue; return -EADDRNOTAVAIL; } return 0; } static int cma_alloc_any_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { static unsigned int last_used_port; int low, high, remaining; unsigned int rover; struct net *net = id_priv->id.route.addr.dev_addr.net; lockdep_assert_held(&lock); inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; rover = get_random_u32_inclusive(low, remaining + low - 1); retry: if (last_used_port != rover) { struct rdma_bind_list *bind_list; int ret; bind_list = cma_ps_find(net, ps, (unsigned short)rover); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, rover); } else { ret = cma_port_is_unique(bind_list, id_priv); if (!ret) cma_bind_port(bind_list, id_priv); } /* * Remember previously used port number in order to avoid * re-using same port immediately after it is closed. */ if (!ret) last_used_port = rover; if (ret != -EADDRNOTAVAIL) return ret; } if (--remaining) { rover++; if ((rover < low) || (rover > high)) rover = low; goto retry; } return -EADDRNOTAVAIL; } /* * Check that the requested port is available. This is called when trying to * bind to a specific port, or when trying to listen on a bound port. In * the latter case, the provided id_priv may already be on the bind_list, but * we still need to check that it's okay to start listening. */ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv, uint8_t reuseaddr) { struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; lockdep_assert_held(&lock); addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; if (reuseaddr && cur_id->reuseaddr) continue; cur_addr = cma_src_addr(cur_id); if (id_priv->afonly && cur_id->afonly && (addr->sa_family != cur_addr->sa_family)) continue; if (cma_any_addr(addr) || cma_any_addr(cur_addr)) return -EADDRNOTAVAIL; if (!cma_addr_cmp(addr, cur_addr)) return -EADDRINUSE; } return 0; } static int cma_use_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; unsigned short snum; int ret; lockdep_assert_held(&lock); snum = ntohs(cma_port(cma_src_addr(id_priv))); if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, snum); } else { ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); if (!ret) cma_bind_port(bind_list, id_priv); } return ret; } static enum rdma_ucm_port_space cma_select_inet_ps(struct rdma_id_private *id_priv) { switch (id_priv->id.ps) { case RDMA_PS_TCP: case RDMA_PS_UDP: case RDMA_PS_IPOIB: case RDMA_PS_IB: return id_priv->id.ps; default: return 0; } } static enum rdma_ucm_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) { enum rdma_ucm_port_space ps = 0; struct sockaddr_ib *sib; u64 sid_ps, mask, sid; sib = (struct sockaddr_ib *) cma_src_addr(id_priv); mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; sid = be64_to_cpu(sib->sib_sid) & mask; if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { sid_ps = RDMA_IB_IP_PS_IB; ps = RDMA_PS_IB; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && (sid == (RDMA_IB_IP_PS_TCP & mask))) { sid_ps = RDMA_IB_IP_PS_TCP; ps = RDMA_PS_TCP; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && (sid == (RDMA_IB_IP_PS_UDP & mask))) { sid_ps = RDMA_IB_IP_PS_UDP; ps = RDMA_PS_UDP; } if (ps) { sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | be64_to_cpu(sib->sib_sid_mask)); } return ps; } static int cma_get_port(struct rdma_id_private *id_priv) { enum rdma_ucm_port_space ps; int ret; if (cma_family(id_priv) != AF_IB) ps = cma_select_inet_ps(id_priv); else ps = cma_select_ib_ps(id_priv); if (!ps) return -EPROTONOSUPPORT; mutex_lock(&lock); if (cma_any_port(cma_src_addr(id_priv))) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); return ret; } static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { #if IS_ENABLED(CONFIG_IPV6) struct sockaddr_in6 *sin6; if (addr->sa_family != AF_INET6) return 0; sin6 = (struct sockaddr_in6 *) addr; if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) return 0; if (!sin6->sin6_scope_id) return -EINVAL; dev_addr->bound_dev_if = sin6->sin6_scope_id; #endif return 0; } int rdma_listen(struct rdma_cm_id *id, int backlog) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) { struct sockaddr_in any_in = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), }; /* For a well behaved ULP state will be RDMA_CM_IDLE */ ret = rdma_bind_addr(id, (struct sockaddr *)&any_in); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN))) return -EINVAL; } /* * Once the ID reaches RDMA_CM_LISTEN it is not allowed to be reusable * any more, and has to be unique in the bind list. */ if (id_priv->reuseaddr) { mutex_lock(&lock); ret = cma_check_port(id_priv->bind_list, id_priv, 0); if (!ret) id_priv->reuseaddr = 0; mutex_unlock(&lock); if (ret) goto err; } id_priv->backlog = backlog; if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id->device, 1)) { ret = cma_ib_listen(id_priv); if (ret) goto err; } else if (rdma_cap_iw_cm(id->device, 1)) { ret = cma_iw_listen(id_priv, backlog); if (ret) goto err; } else { ret = -ENOSYS; goto err; } } else { ret = cma_listen_on_all(id_priv); if (ret) goto err; } return 0; err: id_priv->backlog = 0; /* * All the failure paths that lead here will not allow the req_handler's * to have run. */ cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_listen); static int rdma_bind_addr_dst(struct rdma_id_private *id_priv, struct sockaddr *addr, const struct sockaddr *daddr) { struct sockaddr *id_daddr; int ret; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && addr->sa_family != AF_IB) return -EAFNOSUPPORT; if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; ret = cma_check_linklocal(&id_priv->id.route.addr.dev_addr, addr); if (ret) goto err1; memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { ret = cma_translate_addr(addr, &id_priv->id.route.addr.dev_addr); if (ret) goto err1; ret = cma_acquire_dev_by_src_ip(id_priv); if (ret) goto err1; } if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { if (addr->sa_family == AF_INET) id_priv->afonly = 1; #if IS_ENABLED(CONFIG_IPV6) else if (addr->sa_family == AF_INET6) { struct net *net = id_priv->id.route.addr.dev_addr.net; id_priv->afonly = net->ipv6.sysctl.bindv6only; } #endif } id_daddr = cma_dst_addr(id_priv); if (daddr != id_daddr) memcpy(id_daddr, daddr, rdma_addr_size(addr)); id_daddr->sa_family = addr->sa_family; ret = cma_get_port(id_priv); if (ret) goto err2; if (!cma_any_addr(addr)) rdma_restrack_add(&id_priv->res); return 0; err2: if (id_priv->cma_dev) cma_release_dev(id_priv); err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, const struct sockaddr *dst_addr) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); struct sockaddr_storage zero_sock = {}; if (src_addr && src_addr->sa_family) return rdma_bind_addr_dst(id_priv, src_addr, dst_addr); /* * When the src_addr is not specified, automatically supply an any addr */ zero_sock.ss_family = dst_addr->sa_family; if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)&zero_sock; struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst_addr; src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id; } else if (dst_addr->sa_family == AF_IB) { ((struct sockaddr_ib *)&zero_sock)->sib_pkey = ((struct sockaddr_ib *)dst_addr)->sib_pkey; } return rdma_bind_addr_dst(id_priv, (struct sockaddr *)&zero_sock, dst_addr); } /* * If required, resolve the source address for bind and leave the id_priv in * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior * calls made by ULP, a previously bound ID will not be re-bound and src_addr is * ignored. */ static int resolve_prepare_src(struct rdma_id_private *id_priv, struct sockaddr *src_addr, const struct sockaddr *dst_addr) { int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { /* For a well behaved ULP state will be RDMA_CM_IDLE */ ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY))) return -EINVAL; } else { memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); } if (cma_family(id_priv) != dst_addr->sa_family) { ret = -EINVAL; goto err_state; } return 0; err_state: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); return ret; } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, const struct sockaddr *dst_addr, unsigned long timeout_ms) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; ret = resolve_prepare_src(id_priv, src_addr, dst_addr); if (ret) return ret; if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); } else { if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { /* * The FSM can return back to RDMA_CM_ADDR_BOUND after * rdma_resolve_ip() is called, eg through the error * path in addr_handler(). If this happens the existing * request must be canceled before issuing a new one. * Since canceling a request is a bit slow and this * oddball path is rare, keep track once a request has * been issued. The track turns out to be a permanent * state since this is the only cancel as it is * immediately before rdma_resolve_ip(). */ if (id_priv->used_resolve_ip) rdma_addr_cancel(&id->route.addr.dev_addr); else id_priv->used_resolve_ip = 1; ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, false, id_priv); } } if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); return rdma_bind_addr_dst(id_priv, addr, cma_dst_addr(id_priv)); } EXPORT_SYMBOL(rdma_bind_addr); static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) { struct cma_hdr *cma_hdr; cma_hdr = hdr; cma_hdr->cma_version = CMA_VERSION; if (cma_family(id_priv) == AF_INET) { struct sockaddr_in *src4, *dst4; src4 = (struct sockaddr_in *) cma_src_addr(id_priv); dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); cma_set_ip_ver(cma_hdr, 4); cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; cma_hdr->port = src4->sin_port; } else if (cma_family(id_priv) == AF_INET6) { struct sockaddr_in6 *src6, *dst6; src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); cma_set_ip_ver(cma_hdr, 6); cma_hdr->src_addr.ip6 = src6->sin6_addr; cma_hdr->dst_addr.ip6 = dst6->sin6_addr; cma_hdr->port = src6->sin6_port; } return 0; } static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; const struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_SIDR_REP_RECEIVED: event.param.ud.private_data = ib_event->private_data; event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; if (rep->status != IB_SIDR_SUCCESS) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = ib_event->param.sidr_rep_rcvd.status; pr_debug_ratelimited("RDMA CM: UNREACHABLE: bad SIDR reply. status %d\n", event.status); break; } ret = cma_set_qkey(id_priv, rep->qkey); if (ret) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to set qkey. status %d\n", ret); event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = ret; break; } ib_init_ah_attr_from_path(id_priv->id.device, id_priv->id.port_num, id_priv->id.route.path_rec, &event.param.ud.ah_attr, rep->sgid_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; event.status = 0; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = cma_cm_event_handler(id_priv, &event); rdma_destroy_ah_attr(&event.param.ud.ah_attr); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return 0; } static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_sidr_req_param req; struct ib_cm_id *id; void *private_data; u8 offset; int ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; } else { private_data = NULL; } if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); if (private_data) { ret = cma_format_hdr(private_data, id_priv); if (ret) goto out; req.private_data = private_data; } id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; req.path = id_priv->id.route.path_rec; req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; trace_cm_send_sidr_req(id_priv); ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } out: kfree(private_data); return ret; } static int cma_connect_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_req_param req; struct rdma_route *route; void *private_data; struct ib_cm_id *id; u8 offset; int ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; } else { private_data = NULL; } if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; route = &id_priv->id.route; if (private_data) { ret = cma_format_hdr(private_data, id_priv); if (ret) goto out; req.private_data = private_data; } req.primary_path = &route->path_rec[0]; req.primary_path_inbound = route->path_rec_inbound; req.primary_path_outbound = route->path_rec_outbound; if (route->num_pri_alt_paths == 2) req.alternate_path = &route->path_rec[1]; req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; /* Alternate path SGID attribute currently unsupported */ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; req.responder_resources = conn_param->responder_resources; req.initiator_depth = conn_param->initiator_depth; req.flow_control = conn_param->flow_control; req.retry_count = min_t(u8, 7, conn_param->retry_count); req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 1 : 0; req.ece.vendor_id = id_priv->ece.vendor_id; req.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_req(id_priv); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: if (ret && !IS_ERR(id)) { ib_destroy_cm_id(id); id_priv->cm_id.ib = NULL; } kfree(private_data); return ret; } static int cma_connect_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_id *cm_id; int ret; struct iw_cm_conn_param iw_param; cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); mutex_lock(&id_priv->qp_mutex); cm_id->tos = id_priv->tos; cm_id->tos_set = id_priv->tos_set; mutex_unlock(&id_priv->qp_mutex); id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv), rdma_addr_size(cma_dst_addr(id_priv))); ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; if (conn_param) { iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num; } else { memset(&iw_param, 0, sizeof iw_param); iw_param.qpn = id_priv->qp_num; } ret = iw_cm_connect(cm_id, &iw_param); out: if (ret) { iw_destroy_cm_id(cm_id); id_priv->cm_id.iw = NULL; } return ret; } /** * rdma_connect_locked - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Same as rdma_connect() but can only be called from the * RDMA_CM_EVENT_ROUTE_RESOLVED handler callback. */ int rdma_connect_locked(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_connect_iw(id_priv, conn_param); } else { ret = -ENOSYS; } if (ret) goto err_state; return 0; err_state: cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_connect_locked); /** * rdma_connect - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Users must have resolved a route for the rdma_cm_id to connect with by having * called rdma_resolve_route before calling this routine. * * This call will either connect to a remote QP or obtain remote QP information * for unconnected rdma_cm_id's. The actual operation is based on the * rdma_cm_id's port space. */ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; mutex_lock(&id_priv->handler_mutex); ret = rdma_connect_locked(id, conn_param); mutex_unlock(&id_priv->handler_mutex); return ret; } EXPORT_SYMBOL(rdma_connect); /** * rdma_connect_ece - Initiate an active connection request with ECE data. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * @ece: ECE parameters * * See rdma_connect() explanation. */ int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, struct rdma_ucm_ece *ece) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); id_priv->ece.vendor_id = ece->vendor_id; id_priv->ece.attr_mod = ece->attr_mod; return rdma_connect(id, conn_param); } EXPORT_SYMBOL(rdma_connect_ece); static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_rep_param rep; int ret; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; ret = cma_modify_qp_rts(id_priv, conn_param); if (ret) goto out; memset(&rep, 0, sizeof rep); rep.qp_num = id_priv->qp_num; rep.starting_psn = id_priv->seq_num; rep.private_data = conn_param->private_data; rep.private_data_len = conn_param->private_data_len; rep.responder_resources = conn_param->responder_resources; rep.initiator_depth = conn_param->initiator_depth; rep.failover_accepted = 0; rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 1 : 0; rep.ece.vendor_id = id_priv->ece.vendor_id; rep.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_rep(id_priv); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; } static int cma_accept_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_conn_param iw_param; int ret; if (!conn_param) return -EINVAL; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; if (id_priv->id.qp) iw_param.qpn = id_priv->qp_num; else iw_param.qpn = conn_param->qp_num; return iw_cm_accept(id_priv->cm_id.iw, &iw_param); } static int cma_send_sidr_rep(struct rdma_id_private *id_priv, enum ib_cm_sidr_status status, u32 qkey, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; int ret; memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { if (qkey) ret = cma_set_qkey(id_priv, qkey); else ret = cma_set_default_qkey(id_priv); if (ret) return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; rep.ece.vendor_id = id_priv->ece.vendor_id; rep.ece.attr_mod = id_priv->ece.attr_mod; } rep.private_data = private_data; rep.private_data_len = private_data_len; trace_cm_send_sidr_rep(id_priv); return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } /** * rdma_accept - Called to accept a connection request or response. * @id: Connection identifier associated with the request. * @conn_param: Information needed to establish the connection. This must be * provided if accepting a connection request. If accepting a connection * response, this parameter must be NULL. * * Typically, this routine is only called by the listener to accept a connection * request. It must also be called on the active side of a connection if the * user is performing their own QP transitions. * * In the case of error, a reject message is sent to the remote side and the * state of the qp associated with the id is modified to error, such that any * previously posted receive buffers would be flushed. * * This function is for use by kernel ULPs and must be called from under the * handler callback. */ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; lockdep_assert_held(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) return -EINVAL; if (!id->qp && conn_param) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { if (conn_param) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, conn_param->qkey, conn_param->private_data, conn_param->private_data_len); else ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, 0, NULL, 0); } else { if (conn_param) ret = cma_accept_ib(id_priv, conn_param); else ret = cma_rep_recv(id_priv); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_accept_iw(id_priv, conn_param); } else { ret = -ENOSYS; } if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); return ret; } EXPORT_SYMBOL(rdma_accept); int rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, struct rdma_ucm_ece *ece) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); id_priv->ece.vendor_id = ece->vendor_id; id_priv->ece.attr_mod = ece->attr_mod; return rdma_accept(id, conn_param); } EXPORT_SYMBOL(rdma_accept_ece); void rdma_lock_handler(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->handler_mutex); } EXPORT_SYMBOL(rdma_lock_handler); void rdma_unlock_handler(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_unlock(&id_priv->handler_mutex); } EXPORT_SYMBOL(rdma_unlock_handler); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; switch (id->device->node_type) { case RDMA_NODE_IB_CA: ret = ib_cm_notify(id_priv->cm_id.ib, event); break; default: ret = 0; break; } return ret; } EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, u8 private_data_len, u8 reason) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, private_data, private_data_len); } else { trace_cm_send_rej(id_priv); ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0, private_data, private_data_len); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); } else { ret = -ENOSYS; } return ret; } EXPORT_SYMBOL(rdma_reject); int rdma_disconnect(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. */ trace_cm_disconnect(id_priv); if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { if (!ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0)) trace_cm_sent_drep(id_priv); } else { trace_cm_sent_dreq(id_priv); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); } else ret = -EINVAL; out: return ret; } EXPORT_SYMBOL(rdma_disconnect); static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, struct ib_sa_multicast *multicast, struct rdma_cm_event *event, struct cma_multicast *mc) { struct rdma_dev_addr *dev_addr; enum ib_gid_type gid_type; struct net_device *ndev; if (status) pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", status); event->status = status; event->param.ud.private_data = mc->context; if (status) { event->event = RDMA_CM_EVENT_MULTICAST_ERROR; return; } dev_addr = &id_priv->id.route.addr.dev_addr; ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); gid_type = id_priv->cma_dev ->default_gid_type[id_priv->id.port_num - rdma_start_port( id_priv->cma_dev->device)]; event->event = RDMA_CM_EVENT_MULTICAST_JOIN; if (ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, ndev, gid_type, &event->param.ud.ah_attr)) { event->event = RDMA_CM_EVENT_MULTICAST_ERROR; goto out; } event->param.ud.qp_num = 0xFFFFFF; event->param.ud.qkey = id_priv->qkey; out: dev_put(ndev); } static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) { struct cma_multicast *mc = multicast->context; struct rdma_id_private *id_priv = mc->id_priv; struct rdma_cm_event event = {}; int ret = 0; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL || READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING) goto out; ret = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); if (!ret) { cma_make_mc_event(status, id_priv, multicast, &event, mc); ret = cma_cm_event_handler(id_priv, &event); } rdma_destroy_ah_attr(&event.param.ud.ah_attr); WARN_ON(ret); out: mutex_unlock(&id_priv->handler_mutex); return 0; } static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr, union ib_gid *mgid) { unsigned char mc_map[MAX_ADDR_LEN]; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sockaddr_in *sin = (struct sockaddr_in *) addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if ((addr->sa_family == AF_INET6) && ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. */ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else if (addr->sa_family == AF_IB) { memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } } static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; int ret; ib_addr_get_mgid(dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (ret) return ret; if (!id_priv->qkey) { ret = cma_set_default_qkey(id_priv); if (ret) return ret; } cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); rec.qkey = cpu_to_be32(id_priv->qkey); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = mc->join_state; comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; if (id_priv->id.ps == RDMA_PS_IPOIB) comp_mask |= IB_SA_MCMEMBER_REC_RATE | IB_SA_MCMEMBER_REC_RATE_SELECTOR | IB_SA_MCMEMBER_REC_MTU_SELECTOR | IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_HOP_LIMIT; mc->sa_mc = ib_sa_join_multicast(&sa_client, id_priv->id.device, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); return PTR_ERR_OR_ZERO(mc->sa_mc); } static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { mgid->raw[0] = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff; mgid->raw[1] = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; mgid->raw[5] = 0; mgid->raw[6] = 0; mgid->raw[7] = 0; mgid->raw[8] = 0; mgid->raw[9] = 0; mgid->raw[10] = 0xff; mgid->raw[11] = 0xff; *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; } } static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; struct ib_sa_multicast ib = {}; enum ib_gid_type gid_type; bool send_only; send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (cma_zero_addr(addr)) return -EINVAL; gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type); ib.rec.pkey = cpu_to_be16(0xffff); if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) return -ENODEV; ib.rec.rate = IB_RATE_PORT_CURRENT; ib.rec.hop_limit = 1; ib.rec.mtu = iboe_get_mtu(ndev->mtu); if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { ib.rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; if (!send_only) { err = cma_igmp_send(ndev, &ib.rec.mgid, true); } } } else { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) err = -ENOTSUPP; } dev_put(ndev); if (err || !ib.rec.mtu) return err ?: -EINVAL; if (!id_priv->qkey) cma_set_default_qkey(id_priv); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &ib.rec.port_gid); INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler); cma_make_mc_event(0, id_priv, &ib, &mc->iboe_join.event, mc); queue_work(cma_wq, &mc->iboe_join.work); return 0; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, u8 join_state, void *context) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); struct cma_multicast *mc; int ret; /* Not supported for kernel QPs */ if (WARN_ON(id->qp)) return -EINVAL; /* ULP is calling this wrong. */ if (!id->device || (READ_ONCE(id_priv->state) != RDMA_CM_ADDR_BOUND && READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED)) return -EINVAL; if (id_priv->id.qp_type != IB_QPT_UD) return -EINVAL; mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return -ENOMEM; memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; mc->join_state = join_state; if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_iboe_join_multicast(id_priv, mc); if (ret) goto out_err; } else if (rdma_cap_ib_mcast(id->device, id->port_num)) { ret = cma_join_ib_multicast(id_priv, mc); if (ret) goto out_err; } else { ret = -ENOSYS; goto out_err; } spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); return 0; out_err: kfree(mc); return ret; } EXPORT_SYMBOL(rdma_join_multicast); void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; struct cma_multicast *mc; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { if (memcmp(&mc->addr, addr, rdma_addr_size(addr)) != 0) continue; list_del(&mc->list); spin_unlock_irq(&id_priv->lock); WARN_ON(id_priv->cma_dev->device != id->device); destroy_mc(id_priv, mc); return; } spin_unlock_irq(&id_priv->lock); } EXPORT_SYMBOL(rdma_leave_multicast); static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr; struct cma_work *work; dev_addr = &id_priv->id.route.addr.dev_addr; if ((dev_addr->bound_dev_if == ndev->ifindex) && (net_eq(dev_net(ndev), dev_addr->net)) && memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { pr_info("RDMA CM addr change for ndev %s used by id %p\n", ndev->name, &id_priv->id); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; INIT_WORK(&work->work, cma_work_handler); work->id = id_priv; work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; cma_id_get(id_priv); queue_work(cma_wq, &work->work); } return 0; } static int cma_netdev_callback(struct notifier_block *self, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct cma_device *cma_dev; struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; if (event != NETDEV_BONDING_FAILOVER) return NOTIFY_DONE; if (!netif_is_bond_master(ndev)) return NOTIFY_DONE; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) list_for_each_entry(id_priv, &cma_dev->id_list, device_item) { ret = cma_netdev_change(ndev, id_priv); if (ret) goto out; } out: mutex_unlock(&lock); return ret; } static void cma_netevent_work_handler(struct work_struct *_work) { struct rdma_id_private *id_priv = container_of(_work, struct rdma_id_private, id.net_work); struct rdma_cm_event event = {}; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; if (cma_cm_event_handler(id_priv, &event)) { __acquire(&id_priv->handler_mutex); id_priv->cm_id.ib = NULL; cma_id_put(id_priv); destroy_id_handler_unlock(id_priv); return; } out_unlock: mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); } static int cma_netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) { struct id_table_entry *ips_node = NULL; struct rdma_id_private *current_id; struct neighbour *neigh = ctx; unsigned long flags; if (event != NETEVENT_NEIGH_UPDATE) return NOTIFY_DONE; spin_lock_irqsave(&id_table_lock, flags); if (neigh->tbl->family == AF_INET6) { struct sockaddr_in6 neigh_sock_6; neigh_sock_6.sin6_family = AF_INET6; neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key; ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, (struct sockaddr *)&neigh_sock_6); } else if (neigh->tbl->family == AF_INET) { struct sockaddr_in neigh_sock_4; neigh_sock_4.sin_family = AF_INET; neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key); ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, (struct sockaddr *)&neigh_sock_4); } else goto out; if (!ips_node) goto out; list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) { if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, neigh->ha, ETH_ALEN)) continue; INIT_WORK(¤t_id->id.net_work, cma_netevent_work_handler); cma_id_get(current_id); queue_work(cma_wq, ¤t_id->id.net_work); } out: spin_unlock_irqrestore(&id_table_lock, flags); return NOTIFY_DONE; } static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; static struct notifier_block cma_netevent_cb = { .notifier_call = cma_netevent_callback }; static void cma_send_device_removal_put(struct rdma_id_private *id_priv) { struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL }; enum rdma_cm_state state; unsigned long flags; mutex_lock(&id_priv->handler_mutex); /* Record that we want to remove the device */ spin_lock_irqsave(&id_priv->lock, flags); state = id_priv->state; if (state == RDMA_CM_DESTROYING || state == RDMA_CM_DEVICE_REMOVAL) { spin_unlock_irqrestore(&id_priv->lock, flags); mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); return; } id_priv->state = RDMA_CM_DEVICE_REMOVAL; spin_unlock_irqrestore(&id_priv->lock, flags); if (cma_cm_event_handler(id_priv, &event)) { /* * At this point the ULP promises it won't call * rdma_destroy_id() concurrently */ cma_id_put(id_priv); mutex_unlock(&id_priv->handler_mutex); trace_cm_id_destroy(id_priv); _destroy_id(id_priv, state); return; } mutex_unlock(&id_priv->handler_mutex); /* * If this races with destroy then the thread that first assigns state * to a destroying does the cancel. */ cma_cancel_operation(id_priv, state); cma_id_put(id_priv); } static void cma_process_remove(struct cma_device *cma_dev) { mutex_lock(&lock); while (!list_empty(&cma_dev->id_list)) { struct rdma_id_private *id_priv = list_first_entry( &cma_dev->id_list, struct rdma_id_private, device_item); list_del_init(&id_priv->listen_item); list_del_init(&id_priv->device_item); cma_id_get(id_priv); mutex_unlock(&lock); cma_send_device_removal_put(id_priv); mutex_lock(&lock); } mutex_unlock(&lock); cma_dev_put(cma_dev); wait_for_completion(&cma_dev->comp); } static bool cma_supported(struct ib_device *device) { u32 i; rdma_for_each_port(device, i) { if (rdma_cap_ib_cm(device, i) || rdma_cap_iw_cm(device, i)) return true; } return false; } static int cma_add_one(struct ib_device *device) { struct rdma_id_private *to_destroy; struct cma_device *cma_dev; struct rdma_id_private *id_priv; unsigned long supported_gids = 0; int ret; u32 i; if (!cma_supported(device)) return -EOPNOTSUPP; cma_dev = kmalloc(sizeof(*cma_dev), GFP_KERNEL); if (!cma_dev) return -ENOMEM; cma_dev->device = device; cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_gid_type), GFP_KERNEL); if (!cma_dev->default_gid_type) { ret = -ENOMEM; goto free_cma_dev; } cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_roce_tos), GFP_KERNEL); if (!cma_dev->default_roce_tos) { ret = -ENOMEM; goto free_gid_type; } rdma_for_each_port (device, i) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) cma_dev->default_gid_type[i - rdma_start_port(device)] = CMA_PREFERRED_ROCE_GID_TYPE; else cma_dev->default_gid_type[i - rdma_start_port(device)] = find_first_bit(&supported_gids, BITS_PER_LONG); cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0; } init_completion(&cma_dev->comp); refcount_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); list_for_each_entry(id_priv, &listen_any_list, listen_any_item) { ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); if (ret) goto free_listen; } mutex_unlock(&lock); trace_cm_add_one(device); return 0; free_listen: list_del(&cma_dev->list); mutex_unlock(&lock); /* cma_process_remove() will delete to_destroy */ cma_process_remove(cma_dev); kfree(cma_dev->default_roce_tos); free_gid_type: kfree(cma_dev->default_gid_type); free_cma_dev: kfree(cma_dev); return ret; } static void cma_remove_one(struct ib_device *device, void *client_data) { struct cma_device *cma_dev = client_data; trace_cm_remove_one(device); mutex_lock(&lock); list_del(&cma_dev->list); mutex_unlock(&lock); cma_process_remove(cma_dev); kfree(cma_dev->default_roce_tos); kfree(cma_dev->default_gid_type); kfree(cma_dev); } static int cma_init_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); xa_init(&pernet->tcp_ps); xa_init(&pernet->udp_ps); xa_init(&pernet->ipoib_ps); xa_init(&pernet->ib_ps); return 0; } static void cma_exit_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); WARN_ON(!xa_empty(&pernet->tcp_ps)); WARN_ON(!xa_empty(&pernet->udp_ps)); WARN_ON(!xa_empty(&pernet->ipoib_ps)); WARN_ON(!xa_empty(&pernet->ib_ps)); } static struct pernet_operations cma_pernet_operations = { .init = cma_init_net, .exit = cma_exit_net, .id = &cma_pernet_id, .size = sizeof(struct cma_pernet), }; static int __init cma_init(void) { int ret; /* * There is a rare lock ordering dependency in cma_netdev_callback() * that only happens when bonding is enabled. Teach lockdep that rtnl * must never be nested under lock so it can find these without having * to test with bonding. */ if (IS_ENABLED(CONFIG_LOCKDEP)) { rtnl_lock(); mutex_lock(&lock); mutex_unlock(&lock); rtnl_unlock(); } cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM); if (!cma_wq) return -ENOMEM; ret = register_pernet_subsys(&cma_pernet_operations); if (ret) goto err_wq; ib_sa_register_client(&sa_client); register_netdevice_notifier(&cma_nb); register_netevent_notifier(&cma_netevent_cb); ret = ib_register_client(&cma_client); if (ret) goto err; ret = cma_configfs_init(); if (ret) goto err_ib; return 0; err_ib: ib_unregister_client(&cma_client); err: unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); err_wq: destroy_workqueue(cma_wq); return ret; } static void __exit cma_cleanup(void) { cma_configfs_exit(); ib_unregister_client(&cma_client); unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); destroy_workqueue(cma_wq); } module_init(cma_init); module_exit(cma_cleanup); |
2 3 3 3 3 3 3 1 3 2 1 3 3 3 3 1 1 1 15 15 15 15 15 15 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 | // SPDX-License-Identifier: GPL-2.0-or-later #include <crypto/hash.h> #include <linux/cpu.h> #include <linux/kref.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/workqueue.h> #include <net/tcp.h> static size_t __scratch_size; static DEFINE_PER_CPU(void __rcu *, sigpool_scratch); struct sigpool_entry { struct crypto_ahash *hash; const char *alg; struct kref kref; uint16_t needs_key:1, reserved:15; }; #define CPOOL_SIZE (PAGE_SIZE / sizeof(struct sigpool_entry)) static struct sigpool_entry cpool[CPOOL_SIZE]; static unsigned int cpool_populated; static DEFINE_MUTEX(cpool_mutex); /* Slow-path */ struct scratches_to_free { struct rcu_head rcu; unsigned int cnt; void *scratches[]; }; static void free_old_scratches(struct rcu_head *head) { struct scratches_to_free *stf; stf = container_of(head, struct scratches_to_free, rcu); while (stf->cnt--) kfree(stf->scratches[stf->cnt]); kfree(stf); } /** * sigpool_reserve_scratch - re-allocates scratch buffer, slow-path * @size: request size for the scratch/temp buffer */ static int sigpool_reserve_scratch(size_t size) { struct scratches_to_free *stf; size_t stf_sz = struct_size(stf, scratches, num_possible_cpus()); int cpu, err = 0; lockdep_assert_held(&cpool_mutex); if (__scratch_size >= size) return 0; stf = kmalloc(stf_sz, GFP_KERNEL); if (!stf) return -ENOMEM; stf->cnt = 0; size = max(size, __scratch_size); cpus_read_lock(); for_each_possible_cpu(cpu) { void *scratch, *old_scratch; scratch = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (!scratch) { err = -ENOMEM; break; } old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch, cpu), scratch, lockdep_is_held(&cpool_mutex)); if (!cpu_online(cpu) || !old_scratch) { kfree(old_scratch); continue; } stf->scratches[stf->cnt++] = old_scratch; } cpus_read_unlock(); if (!err) __scratch_size = size; call_rcu(&stf->rcu, free_old_scratches); return err; } static void sigpool_scratch_free(void) { int cpu; for_each_possible_cpu(cpu) kfree(rcu_replace_pointer(per_cpu(sigpool_scratch, cpu), NULL, lockdep_is_held(&cpool_mutex))); __scratch_size = 0; } static int __cpool_try_clone(struct crypto_ahash *hash) { struct crypto_ahash *tmp; tmp = crypto_clone_ahash(hash); if (IS_ERR(tmp)) return PTR_ERR(tmp); crypto_free_ahash(tmp); return 0; } static int __cpool_alloc_ahash(struct sigpool_entry *e, const char *alg) { struct crypto_ahash *cpu0_hash; int ret; e->alg = kstrdup(alg, GFP_KERNEL); if (!e->alg) return -ENOMEM; cpu0_hash = crypto_alloc_ahash(alg, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(cpu0_hash)) { ret = PTR_ERR(cpu0_hash); goto out_free_alg; } e->needs_key = crypto_ahash_get_flags(cpu0_hash) & CRYPTO_TFM_NEED_KEY; ret = __cpool_try_clone(cpu0_hash); if (ret) goto out_free_cpu0_hash; e->hash = cpu0_hash; kref_init(&e->kref); return 0; out_free_cpu0_hash: crypto_free_ahash(cpu0_hash); out_free_alg: kfree(e->alg); e->alg = NULL; return ret; } /** * tcp_sigpool_alloc_ahash - allocates pool for ahash requests * @alg: name of async hash algorithm * @scratch_size: reserve a tcp_sigpool::scratch buffer of this size */ int tcp_sigpool_alloc_ahash(const char *alg, size_t scratch_size) { int i, ret; /* slow-path */ mutex_lock(&cpool_mutex); ret = sigpool_reserve_scratch(scratch_size); if (ret) goto out; for (i = 0; i < cpool_populated; i++) { if (!cpool[i].alg) continue; if (strcmp(cpool[i].alg, alg)) continue; /* pairs with tcp_sigpool_release() */ if (!kref_get_unless_zero(&cpool[i].kref)) kref_init(&cpool[i].kref); ret = i; goto out; } for (i = 0; i < cpool_populated; i++) { if (!cpool[i].alg) break; } if (i >= CPOOL_SIZE) { ret = -ENOSPC; goto out; } ret = __cpool_alloc_ahash(&cpool[i], alg); if (!ret) { ret = i; if (i == cpool_populated) cpool_populated++; } out: mutex_unlock(&cpool_mutex); return ret; } EXPORT_SYMBOL_GPL(tcp_sigpool_alloc_ahash); static void __cpool_free_entry(struct sigpool_entry *e) { crypto_free_ahash(e->hash); kfree(e->alg); memset(e, 0, sizeof(*e)); } static void cpool_cleanup_work_cb(struct work_struct *work) { bool free_scratch = true; unsigned int i; mutex_lock(&cpool_mutex); for (i = 0; i < cpool_populated; i++) { if (kref_read(&cpool[i].kref) > 0) { free_scratch = false; continue; } if (!cpool[i].alg) continue; __cpool_free_entry(&cpool[i]); } if (free_scratch) sigpool_scratch_free(); mutex_unlock(&cpool_mutex); } static DECLARE_WORK(cpool_cleanup_work, cpool_cleanup_work_cb); static void cpool_schedule_cleanup(struct kref *kref) { schedule_work(&cpool_cleanup_work); } /** * tcp_sigpool_release - decreases number of users for a pool. If it was * the last user of the pool, releases any memory that was consumed. * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() */ void tcp_sigpool_release(unsigned int id) { if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) return; /* slow-path */ kref_put(&cpool[id].kref, cpool_schedule_cleanup); } EXPORT_SYMBOL_GPL(tcp_sigpool_release); /** * tcp_sigpool_get - increases number of users (refcounter) for a pool * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() */ void tcp_sigpool_get(unsigned int id) { if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) return; kref_get(&cpool[id].kref); } EXPORT_SYMBOL_GPL(tcp_sigpool_get); int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(RCU_BH) { struct crypto_ahash *hash; rcu_read_lock_bh(); if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) { rcu_read_unlock_bh(); return -EINVAL; } hash = crypto_clone_ahash(cpool[id].hash); if (IS_ERR(hash)) { rcu_read_unlock_bh(); return PTR_ERR(hash); } c->req = ahash_request_alloc(hash, GFP_ATOMIC); if (!c->req) { crypto_free_ahash(hash); rcu_read_unlock_bh(); return -ENOMEM; } ahash_request_set_callback(c->req, 0, NULL, NULL); /* Pairs with tcp_sigpool_reserve_scratch(), scratch area is * valid (allocated) until tcp_sigpool_end(). */ c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch)); return 0; } EXPORT_SYMBOL_GPL(tcp_sigpool_start); void tcp_sigpool_end(struct tcp_sigpool *c) __releases(RCU_BH) { struct crypto_ahash *hash = crypto_ahash_reqtfm(c->req); rcu_read_unlock_bh(); ahash_request_free(c->req); crypto_free_ahash(hash); } EXPORT_SYMBOL_GPL(tcp_sigpool_end); /** * tcp_sigpool_algo - return algorithm of tcp_sigpool * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() * @buf: buffer to return name of algorithm * @buf_len: size of @buf */ size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len) { if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) return -EINVAL; return strscpy(buf, cpool[id].alg, buf_len); } EXPORT_SYMBOL_GPL(tcp_sigpool_algo); /** * tcp_sigpool_hash_skb_data - hash data in skb with initialized tcp_sigpool * @hp: tcp_sigpool pointer * @skb: buffer to add sign for * @header_len: TCP header length for this segment */ int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp, const struct sk_buff *skb, unsigned int header_len) { const unsigned int head_data_len = skb_headlen(skb) > header_len ? skb_headlen(skb) - header_len : 0; const struct skb_shared_info *shi = skb_shinfo(skb); const struct tcphdr *tp = tcp_hdr(skb); struct ahash_request *req = hp->req; struct sk_buff *frag_iter; struct scatterlist sg; unsigned int i; sg_init_table(&sg, 1); sg_set_buf(&sg, ((u8 *)tp) + header_len, head_data_len); ahash_request_set_crypt(req, &sg, NULL, head_data_len); if (crypto_ahash_update(req)) return 1; for (i = 0; i < shi->nr_frags; ++i) { const skb_frag_t *f = &shi->frags[i]; unsigned int offset = skb_frag_off(f); struct page *page; page = skb_frag_page(f) + (offset >> PAGE_SHIFT); sg_set_page(&sg, page, skb_frag_size(f), offset_in_page(offset)); ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f)); if (crypto_ahash_update(req)) return 1; } skb_walk_frags(skb, frag_iter) if (tcp_sigpool_hash_skb_data(hp, frag_iter, 0)) return 1; return 0; } EXPORT_SYMBOL(tcp_sigpool_hash_skb_data); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Per-CPU pool of crypto requests"); |
1 31 57 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BLOCK_BLK_PM_H_ #define _BLOCK_BLK_PM_H_ #include <linux/pm_runtime.h> #ifdef CONFIG_PM static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { if (!q->dev || !blk_queue_pm_only(q)) return 1; /* Nothing to do */ if (pm && q->rpm_status != RPM_SUSPENDED) return 1; /* Request allowed */ pm_request_resume(q->dev); return 0; } static inline void blk_pm_mark_last_busy(struct request *rq) { if (rq->q->dev && !(rq->rq_flags & RQF_PM)) pm_runtime_mark_last_busy(rq->q->dev); } #else static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { return 1; } static inline void blk_pm_mark_last_busy(struct request *rq) { } #endif #endif /* _BLOCK_BLK_PM_H_ */ |
2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 | /* * linux/fs/nls/nls_cp855.c * * Charset cp855 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0452, 0x0402, 0x0453, 0x0403, 0x0451, 0x0401, 0x0454, 0x0404, 0x0455, 0x0405, 0x0456, 0x0406, 0x0457, 0x0407, 0x0458, 0x0408, /* 0x90*/ 0x0459, 0x0409, 0x045a, 0x040a, 0x045b, 0x040b, 0x045c, 0x040c, 0x045e, 0x040e, 0x045f, 0x040f, 0x044e, 0x042e, 0x044a, 0x042a, /* 0xa0*/ 0x0430, 0x0410, 0x0431, 0x0411, 0x0446, 0x0426, 0x0434, 0x0414, 0x0435, 0x0415, 0x0444, 0x0424, 0x0433, 0x0413, 0x00ab, 0x00bb, /* 0xb0*/ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x0445, 0x0425, 0x0438, 0x0418, 0x2563, 0x2551, 0x2557, 0x255d, 0x0439, 0x0419, 0x2510, /* 0xc0*/ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x043a, 0x041a, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, /* 0xd0*/ 0x043b, 0x041b, 0x043c, 0x041c, 0x043d, 0x041d, 0x043e, 0x041e, 0x043f, 0x2518, 0x250c, 0x2588, 0x2584, 0x041f, 0x044f, 0x2580, /* 0xe0*/ 0x042f, 0x0440, 0x0420, 0x0441, 0x0421, 0x0442, 0x0422, 0x0443, 0x0423, 0x0436, 0x0416, 0x0432, 0x0412, 0x044c, 0x042c, 0x2116, /* 0xf0*/ 0x00ad, 0x044b, 0x042b, 0x0437, 0x0417, 0x0448, 0x0428, 0x044d, 0x042d, 0x0449, 0x0429, 0x0447, 0x0427, 0x00a7, 0x25a0, 0x00a0, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xff, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0xfd, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0xae, 0x00, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ }; static const unsigned char page04[256] = { 0x00, 0x85, 0x81, 0x83, 0x87, 0x89, 0x8b, 0x8d, /* 0x00-0x07 */ 0x8f, 0x91, 0x93, 0x95, 0x97, 0x00, 0x99, 0x9b, /* 0x08-0x0f */ 0xa1, 0xa3, 0xec, 0xad, 0xa7, 0xa9, 0xea, 0xf4, /* 0x10-0x17 */ 0xb8, 0xbe, 0xc7, 0xd1, 0xd3, 0xd5, 0xd7, 0xdd, /* 0x18-0x1f */ 0xe2, 0xe4, 0xe6, 0xe8, 0xab, 0xb6, 0xa5, 0xfc, /* 0x20-0x27 */ 0xf6, 0xfa, 0x9f, 0xf2, 0xee, 0xf8, 0x9d, 0xe0, /* 0x28-0x2f */ 0xa0, 0xa2, 0xeb, 0xac, 0xa6, 0xa8, 0xe9, 0xf3, /* 0x30-0x37 */ 0xb7, 0xbd, 0xc6, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, /* 0x38-0x3f */ 0xe1, 0xe3, 0xe5, 0xe7, 0xaa, 0xb5, 0xa4, 0xfb, /* 0x40-0x47 */ 0xf5, 0xf9, 0x9e, 0xf1, 0xed, 0xf7, 0x9c, 0xde, /* 0x48-0x4f */ 0x00, 0x84, 0x80, 0x82, 0x86, 0x88, 0x8a, 0x8c, /* 0x50-0x57 */ 0x8e, 0x90, 0x92, 0x94, 0x96, 0x00, 0x98, 0x9a, /* 0x58-0x5f */ }; static const unsigned char page21[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xef, 0x00, /* 0x10-0x17 */ }; static const unsigned char page25[256] = { 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page21, NULL, NULL, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x80, 0x82, 0x82, 0x84, 0x84, 0x86, 0x86, /* 0x80-0x87 */ 0x88, 0x88, 0x8a, 0x8a, 0x8c, 0x8c, 0x8e, 0x8e, /* 0x88-0x8f */ 0x90, 0x90, 0x92, 0x92, 0x94, 0x94, 0x96, 0x96, /* 0x90-0x97 */ 0x98, 0x98, 0x9a, 0x9a, 0x9c, 0x9c, 0x9e, 0x9e, /* 0x98-0x9f */ 0xa0, 0xa0, 0xa2, 0xa2, 0xa4, 0xa4, 0xa6, 0xa6, /* 0xa0-0xa7 */ 0xa8, 0xa8, 0xaa, 0xaa, 0xac, 0xac, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb5, 0xb7, /* 0xb0-0xb7 */ 0xb7, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbd, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd0, 0xd2, 0xd2, 0xd4, 0xd4, 0xd6, 0xd6, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xd8, 0xde, 0xdf, /* 0xd8-0xdf */ 0xde, 0xe1, 0xe1, 0xe3, 0xe3, 0xe5, 0xe5, 0xe7, /* 0xe0-0xe7 */ 0xe7, 0xe9, 0xe9, 0xeb, 0xeb, 0xed, 0xed, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf1, 0xf3, 0xf3, 0xf5, 0xf5, 0xf7, /* 0xf0-0xf7 */ 0xf7, 0xf9, 0xf9, 0xfb, 0xfb, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x81, 0x81, 0x83, 0x83, 0x85, 0x85, 0x87, 0x87, /* 0x80-0x87 */ 0x89, 0x89, 0x8b, 0x8b, 0x8d, 0x8d, 0x8f, 0x8f, /* 0x88-0x8f */ 0x91, 0x91, 0x93, 0x93, 0x95, 0x95, 0x97, 0x97, /* 0x90-0x97 */ 0x99, 0x99, 0x9b, 0x9b, 0x9d, 0x9d, 0x9f, 0x9f, /* 0x98-0x9f */ 0xa1, 0xa1, 0xa3, 0xa3, 0xa5, 0xa5, 0xa7, 0xa7, /* 0xa0-0xa7 */ 0xa9, 0xa9, 0xab, 0xab, 0xad, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb6, 0xb6, 0xb8, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd1, 0xd1, 0xd3, 0xd3, 0xd5, 0xd5, 0xd7, 0xd7, /* 0xd0-0xd7 */ 0xdd, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xe0, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe2, 0xe2, 0xe4, 0xe4, 0xe6, 0xe6, 0xe8, /* 0xe0-0xe7 */ 0xe8, 0xea, 0xea, 0xec, 0xec, 0xee, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf2, 0xf2, 0xf4, 0xf4, 0xf6, 0xf6, 0xf8, /* 0xf0-0xf7 */ 0xf8, 0xfa, 0xfa, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp855", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp855(void) { return register_nls(&table); } static void __exit exit_nls_cp855(void) { unregister_nls(&table); } module_init(init_nls_cp855) module_exit(exit_nls_cp855) MODULE_LICENSE("Dual BSD/GPL"); |
1 2 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 | // SPDX-License-Identifier: GPL-2.0-only /* * TCP HYBLA * * TCP-HYBLA Congestion control algorithm, based on: * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement * for Heterogeneous Networks", * International Journal on satellite Communications, * September 2004 * Daniele Lacamera * root at danielinux.net */ #include <linux/module.h> #include <net/tcp.h> /* Tcp Hybla structure. */ struct hybla { bool hybla_en; u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ u32 rho; /* Rho parameter, integer part */ u32 rho2; /* Rho * Rho, integer part */ u32 rho_3ls; /* Rho parameter, <<3 */ u32 rho2_7ls; /* Rho^2, <<7 */ u32 minrtt_us; /* Minimum smoothed round trip time value seen */ }; /* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ static int rtt0 = 25; module_param(rtt0, int, 0644); MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); /* This is called to refresh values for hybla parameters */ static inline void hybla_recalc_param (struct sock *sk) { struct hybla *ca = inet_csk_ca(sk); ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC), 8U); ca->rho = ca->rho_3ls >> 3; ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; ca->rho2 = ca->rho2_7ls >> 7; } static void hybla_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct hybla *ca = inet_csk_ca(sk); ca->rho = 0; ca->rho2 = 0; ca->rho_3ls = 0; ca->rho2_7ls = 0; ca->snd_cwnd_cents = 0; ca->hybla_en = true; tcp_snd_cwnd_set(tp, 2); tp->snd_cwnd_clamp = 65535; /* 1st Rho measurement based on initial srtt */ hybla_recalc_param(sk); /* set minimum rtt as this is the 1st ever seen */ ca->minrtt_us = tp->srtt_us; tcp_snd_cwnd_set(tp, ca->rho); } static void hybla_state(struct sock *sk, u8 ca_state) { struct hybla *ca = inet_csk_ca(sk); ca->hybla_en = (ca_state == TCP_CA_Open); } static inline u32 hybla_fraction(u32 odds) { static const u32 fractions[] = { 128, 139, 152, 165, 181, 197, 215, 234, }; return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128; } /* TCP Hybla main routine. * This is the algorithm behavior: * o Recalc Hybla parameters if min_rtt has changed * o Give cwnd a new value based on the model proposed * o remember increments <1 */ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct hybla *ca = inet_csk_ca(sk); u32 increment, odd, rho_fractions; int is_slowstart = 0; /* Recalculate rho only if this srtt is the lowest */ if (tp->srtt_us < ca->minrtt_us) { hybla_recalc_param(sk); ca->minrtt_us = tp->srtt_us; } if (!tcp_is_cwnd_limited(sk)) return; if (!ca->hybla_en) { tcp_reno_cong_avoid(sk, ack, acked); return; } if (ca->rho == 0) hybla_recalc_param(sk); rho_fractions = ca->rho_3ls - (ca->rho << 3); if (tcp_in_slow_start(tp)) { /* * slow start * INC = 2^RHO - 1 * This is done by splitting the rho parameter * into 2 parts: an integer part and a fraction part. * Inrement<<7 is estimated by doing: * [2^(int+fract)]<<7 * that is equal to: * (2^int) * [(2^fract) <<7] * 2^int is straightly computed as 1<<int, * while we will use hybla_slowstart_fraction_increment() to * calculate 2^fract in a <<7 value. */ is_slowstart = 1; increment = ((1 << min(ca->rho, 16U)) * hybla_fraction(rho_fractions)) - 128; } else { /* * congestion avoidance * INC = RHO^2 / W * as long as increment is estimated as (rho<<7)/window * it already is <<7 and we can easily count its fractions. */ increment = ca->rho2_7ls / tcp_snd_cwnd(tp); if (increment < 128) tp->snd_cwnd_cnt++; } odd = increment % 128; tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + (increment >> 7)); ca->snd_cwnd_cents += odd; /* check when fractions goes >=128 and increase cwnd by 1. */ while (ca->snd_cwnd_cents >= 128) { tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); ca->snd_cwnd_cents -= 128; tp->snd_cwnd_cnt = 0; } /* check when cwnd has not been incremented for a while */ if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) { tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); tp->snd_cwnd_cnt = 0; } /* clamp down slowstart cwnd to ssthresh value. */ if (is_slowstart) tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_ssthresh)); tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp)); } static struct tcp_congestion_ops tcp_hybla __read_mostly = { .init = hybla_init, .ssthresh = tcp_reno_ssthresh, .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = hybla_cong_avoid, .set_state = hybla_state, .owner = THIS_MODULE, .name = "hybla" }; static int __init hybla_register(void) { BUILD_BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_hybla); } static void __exit hybla_unregister(void) { tcp_unregister_congestion_control(&tcp_hybla); } module_init(hybla_register); module_exit(hybla_unregister); MODULE_AUTHOR("Daniele Lacamera"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP Hybla"); |
1 1 9 9 9 1 1 1 8 8 1 1 2 2 1 1 5 1 1 2 2 4 4 3 1 2 2 2 8 7 5 5 4 2 1 2 4 3 2 5 3 18 19 19 19 16 2 1 7 11 18 14 14 1 2 1 1 1 13 13 1 9 9 11 2 5 5 6 6 2 5 2 5 6 2 1 1 2 3 1 1 1 4 1 1 1 2 1 1 7 1 6 1 1 2 7 1 1 1 1 1 8 8 2 8 4 1 2 2 2 1 1 4 3 2 1 3 2 1 2 2 2 1 1 1 2 18 2 6 4 1 2 3 5 5 5 7 1 6 7 2 5 7 7 8 1 7 6 1 7 7 7 7 4 3 2 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 | // SPDX-License-Identifier: GPL-2.0-only /* * Kernel Connection Multiplexor * * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> */ #include <linux/bpf.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/file.h> #include <linux/filter.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/poll.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/uaccess.h> #include <linux/workqueue.h> #include <linux/syscalls.h> #include <linux/sched/signal.h> #include <net/kcm.h> #include <net/netns/generic.h> #include <net/sock.h> #include <uapi/linux/kcm.h> #include <trace/events/sock.h> unsigned int kcm_net_id; static struct kmem_cache *kcm_psockp __read_mostly; static struct kmem_cache *kcm_muxp __read_mostly; static struct workqueue_struct *kcm_wq; static inline struct kcm_sock *kcm_sk(const struct sock *sk) { return (struct kcm_sock *)sk; } static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) { return (struct kcm_tx_msg *)skb->cb; } static void report_csk_error(struct sock *csk, int err) { csk->sk_err = EPIPE; sk_error_report(csk); } static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, bool wakeup_kcm) { struct sock *csk = psock->sk; struct kcm_mux *mux = psock->mux; /* Unrecoverable error in transmit */ spin_lock_bh(&mux->lock); if (psock->tx_stopped) { spin_unlock_bh(&mux->lock); return; } psock->tx_stopped = 1; KCM_STATS_INCR(psock->stats.tx_aborts); if (!psock->tx_kcm) { /* Take off psocks_avail list */ list_del(&psock->psock_avail_list); } else if (wakeup_kcm) { /* In this case psock is being aborted while outside of * write_msgs and psock is reserved. Schedule tx_work * to handle the failure there. Need to commit tx_stopped * before queuing work. */ smp_mb(); queue_work(kcm_wq, &psock->tx_kcm->tx_work); } spin_unlock_bh(&mux->lock); /* Report error on lower socket */ report_csk_error(csk, err); } /* RX mux lock held. */ static void kcm_update_rx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { STRP_STATS_ADD(mux->stats.rx_bytes, psock->strp.stats.bytes - psock->saved_rx_bytes); mux->stats.rx_msgs += psock->strp.stats.msgs - psock->saved_rx_msgs; psock->saved_rx_msgs = psock->strp.stats.msgs; psock->saved_rx_bytes = psock->strp.stats.bytes; } static void kcm_update_tx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { KCM_STATS_ADD(mux->stats.tx_bytes, psock->stats.tx_bytes - psock->saved_tx_bytes); mux->stats.tx_msgs += psock->stats.tx_msgs - psock->saved_tx_msgs; psock->saved_tx_msgs = psock->stats.tx_msgs; psock->saved_tx_bytes = psock->stats.tx_bytes; } static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); /* KCM is ready to receive messages on its queue-- either the KCM is new or * has become unblocked after being blocked on full socket buffer. Queue any * pending ready messages on a psock. RX mux lock held. */ static void kcm_rcv_ready(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; struct kcm_psock *psock; struct sk_buff *skb; if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled)) return; while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) { if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Assuming buffer limit has been reached */ skb_queue_head(&mux->rx_hold_queue, skb); WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); return; } } while (!list_empty(&mux->psocks_ready)) { psock = list_first_entry(&mux->psocks_ready, struct kcm_psock, psock_ready_list); if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) { /* Assuming buffer limit has been reached */ WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); return; } /* Consumed the ready message on the psock. Schedule rx_work to * get more messages. */ list_del(&psock->psock_ready_list); psock->ready_rx_msg = NULL; /* Commit clearing of ready_rx_msg for queuing work */ smp_mb(); strp_unpause(&psock->strp); strp_check_rcv(&psock->strp); } /* Buffer limit is okay now, add to ready list */ list_add_tail(&kcm->wait_rx_list, &kcm->mux->kcm_rx_waiters); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, true); } static void kcm_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct kcm_sock *kcm = kcm_sk(sk); struct kcm_mux *mux = kcm->mux; unsigned int len = skb->truesize; sk_mem_uncharge(sk, len); atomic_sub(len, &sk->sk_rmem_alloc); /* For reading rx_wait and rx_psock without holding lock */ smp_mb__after_atomic(); if (!READ_ONCE(kcm->rx_wait) && !READ_ONCE(kcm->rx_psock) && sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) { spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); spin_unlock_bh(&mux->rx_lock); } } static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) return -ENOMEM; if (!sk_rmem_schedule(sk, skb, skb->truesize)) return -ENOBUFS; skb->dev = NULL; skb_orphan(skb); skb->sk = sk; skb->destructor = kcm_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); skb_queue_tail(list, skb); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return 0; } /* Requeue received messages for a kcm socket to other kcm sockets. This is * called with a kcm socket is receive disabled. * RX mux lock held. */ static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head) { struct sk_buff *skb; struct kcm_sock *kcm; while ((skb = skb_dequeue(head))) { /* Reset destructor to avoid calling kcm_rcv_ready */ skb->destructor = sock_rfree; skb_orphan(skb); try_again: if (list_empty(&mux->kcm_rx_waiters)) { skb_queue_tail(&mux->rx_hold_queue, skb); continue; } kcm = list_first_entry(&mux->kcm_rx_waiters, struct kcm_sock, wait_rx_list); if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Should mean socket buffer full */ list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); /* Commit rx_wait to read in kcm_free */ smp_wmb(); goto try_again; } } } /* Lower sock lock held */ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, struct sk_buff *head) { struct kcm_mux *mux = psock->mux; struct kcm_sock *kcm; WARN_ON(psock->ready_rx_msg); if (psock->rx_kcm) return psock->rx_kcm; spin_lock_bh(&mux->rx_lock); if (psock->rx_kcm) { spin_unlock_bh(&mux->rx_lock); return psock->rx_kcm; } kcm_update_rx_mux_stats(mux, psock); if (list_empty(&mux->kcm_rx_waiters)) { psock->ready_rx_msg = head; strp_pause(&psock->strp); list_add_tail(&psock->psock_ready_list, &mux->psocks_ready); spin_unlock_bh(&mux->rx_lock); return NULL; } kcm = list_first_entry(&mux->kcm_rx_waiters, struct kcm_sock, wait_rx_list); list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); psock->rx_kcm = kcm; /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_psock, psock); spin_unlock_bh(&mux->rx_lock); return kcm; } static void kcm_done(struct kcm_sock *kcm); static void kcm_done_work(struct work_struct *w) { kcm_done(container_of(w, struct kcm_sock, done_work)); } /* Lower sock held */ static void unreserve_rx_kcm(struct kcm_psock *psock, bool rcv_ready) { struct kcm_sock *kcm = psock->rx_kcm; struct kcm_mux *mux = psock->mux; if (!kcm) return; spin_lock_bh(&mux->rx_lock); psock->rx_kcm = NULL; /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_psock, NULL); /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with * kcm_rfree */ smp_mb(); if (unlikely(kcm->done)) { spin_unlock_bh(&mux->rx_lock); /* Need to run kcm_done in a task since we need to qcquire * callback locks which may already be held here. */ INIT_WORK(&kcm->done_work, kcm_done_work); schedule_work(&kcm->done_work); return; } if (unlikely(kcm->rx_disabled)) { requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) { /* Check for degenerative race with rx_wait that all * data was dequeued (accounted for in kcm_rfree). */ kcm_rcv_ready(kcm); } spin_unlock_bh(&mux->rx_lock); } /* Lower sock lock held */ static void psock_data_ready(struct sock *sk) { struct kcm_psock *psock; trace_sk_data_ready(sk); read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; if (likely(psock)) strp_data_ready(&psock->strp); read_unlock_bh(&sk->sk_callback_lock); } /* Called with lower sock held */ static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct kcm_sock *kcm; try_queue: kcm = reserve_rx_kcm(psock, skb); if (!kcm) { /* Unable to reserve a KCM, message is held in psock and strp * is paused. */ return; } if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Should mean socket buffer full */ unreserve_rx_kcm(psock, false); goto try_queue; } } static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct bpf_prog *prog = psock->bpf_prog; int res; res = bpf_prog_run_pin_on_cpu(prog, skb); return res; } static int kcm_read_sock_done(struct strparser *strp, int err) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); unreserve_rx_kcm(psock, true); return err; } static void psock_state_change(struct sock *sk) { /* TCP only does a EPOLLIN for a half close. Do a EPOLLHUP here * since application will normally not poll with EPOLLIN * on the TCP sockets. */ report_csk_error(sk, EPIPE); } static void psock_write_space(struct sock *sk) { struct kcm_psock *psock; struct kcm_mux *mux; struct kcm_sock *kcm; read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; if (unlikely(!psock)) goto out; mux = psock->mux; spin_lock_bh(&mux->lock); /* Check if the socket is reserved so someone is waiting for sending. */ kcm = psock->tx_kcm; if (kcm && !unlikely(kcm->tx_stopped)) queue_work(kcm_wq, &kcm->tx_work); spin_unlock_bh(&mux->lock); out: read_unlock_bh(&sk->sk_callback_lock); } static void unreserve_psock(struct kcm_sock *kcm); /* kcm sock is locked. */ static struct kcm_psock *reserve_psock(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; struct kcm_psock *psock; psock = kcm->tx_psock; smp_rmb(); /* Must read tx_psock before tx_wait */ if (psock) { WARN_ON(kcm->tx_wait); if (unlikely(psock->tx_stopped)) unreserve_psock(kcm); else return kcm->tx_psock; } spin_lock_bh(&mux->lock); /* Check again under lock to see if psock was reserved for this * psock via psock_unreserve. */ psock = kcm->tx_psock; if (unlikely(psock)) { WARN_ON(kcm->tx_wait); spin_unlock_bh(&mux->lock); return kcm->tx_psock; } if (!list_empty(&mux->psocks_avail)) { psock = list_first_entry(&mux->psocks_avail, struct kcm_psock, psock_avail_list); list_del(&psock->psock_avail_list); if (kcm->tx_wait) { list_del(&kcm->wait_psock_list); kcm->tx_wait = false; } kcm->tx_psock = psock; psock->tx_kcm = kcm; KCM_STATS_INCR(psock->stats.reserved); } else if (!kcm->tx_wait) { list_add_tail(&kcm->wait_psock_list, &mux->kcm_tx_waiters); kcm->tx_wait = true; } spin_unlock_bh(&mux->lock); return psock; } /* mux lock held */ static void psock_now_avail(struct kcm_psock *psock) { struct kcm_mux *mux = psock->mux; struct kcm_sock *kcm; if (list_empty(&mux->kcm_tx_waiters)) { list_add_tail(&psock->psock_avail_list, &mux->psocks_avail); } else { kcm = list_first_entry(&mux->kcm_tx_waiters, struct kcm_sock, wait_psock_list); list_del(&kcm->wait_psock_list); kcm->tx_wait = false; psock->tx_kcm = kcm; /* Commit before changing tx_psock since that is read in * reserve_psock before queuing work. */ smp_mb(); kcm->tx_psock = psock; KCM_STATS_INCR(psock->stats.reserved); queue_work(kcm_wq, &kcm->tx_work); } } /* kcm sock is locked. */ static void unreserve_psock(struct kcm_sock *kcm) { struct kcm_psock *psock; struct kcm_mux *mux = kcm->mux; spin_lock_bh(&mux->lock); psock = kcm->tx_psock; if (WARN_ON(!psock)) { spin_unlock_bh(&mux->lock); return; } smp_rmb(); /* Read tx_psock before tx_wait */ kcm_update_tx_mux_stats(mux, psock); WARN_ON(kcm->tx_wait); kcm->tx_psock = NULL; psock->tx_kcm = NULL; KCM_STATS_INCR(psock->stats.unreserved); if (unlikely(psock->tx_stopped)) { if (psock->done) { /* Deferred free */ list_del(&psock->psock_list); mux->psocks_cnt--; sock_put(psock->sk); fput(psock->sk->sk_socket->file); kmem_cache_free(kcm_psockp, psock); } /* Don't put back on available list */ spin_unlock_bh(&mux->lock); return; } psock_now_avail(psock); spin_unlock_bh(&mux->lock); } static void kcm_report_tx_retry(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; spin_lock_bh(&mux->lock); KCM_STATS_INCR(mux->stats.tx_retries); spin_unlock_bh(&mux->lock); } /* Write any messages ready on the kcm socket. Called with kcm sock lock * held. Return bytes actually sent or error. */ static int kcm_write_msgs(struct kcm_sock *kcm) { unsigned int total_sent = 0; struct sock *sk = &kcm->sk; struct kcm_psock *psock; struct sk_buff *head; int ret = 0; kcm->tx_wait_more = false; psock = kcm->tx_psock; if (unlikely(psock && psock->tx_stopped)) { /* A reserved psock was aborted asynchronously. Unreserve * it and we'll retry the message. */ unreserve_psock(kcm); kcm_report_tx_retry(kcm); if (skb_queue_empty(&sk->sk_write_queue)) return 0; kcm_tx_msg(skb_peek(&sk->sk_write_queue))->started_tx = false; } retry: while ((head = skb_peek(&sk->sk_write_queue))) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, }; struct kcm_tx_msg *txm = kcm_tx_msg(head); struct sk_buff *skb; unsigned int msize; int i; if (!txm->started_tx) { psock = reserve_psock(kcm); if (!psock) goto out; skb = head; txm->frag_offset = 0; txm->sent = 0; txm->started_tx = true; } else { if (WARN_ON(!psock)) { ret = -EINVAL; goto out; } skb = txm->frag_skb; } if (WARN_ON(!skb_shinfo(skb)->nr_frags) || WARN_ON_ONCE(!skb_frag_page(&skb_shinfo(skb)->frags[0]))) { ret = -EINVAL; goto out; } msize = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) msize += skb_frag_size(&skb_shinfo(skb)->frags[i]); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, (const struct bio_vec *)skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags, msize); iov_iter_advance(&msg.msg_iter, txm->frag_offset); do { ret = sock_sendmsg(psock->sk->sk_socket, &msg); if (ret <= 0) { if (ret == -EAGAIN) { /* Save state to try again when there's * write space on the socket */ txm->frag_skb = skb; ret = 0; goto out; } /* Hard failure in sending message, abort this * psock since it has lost framing * synchronization and retry sending the * message from the beginning. */ kcm_abort_tx_psock(psock, ret ? -ret : EPIPE, true); unreserve_psock(kcm); psock = NULL; txm->started_tx = false; kcm_report_tx_retry(kcm); ret = 0; goto retry; } txm->sent += ret; txm->frag_offset += ret; KCM_STATS_ADD(psock->stats.tx_bytes, ret); } while (msg.msg_iter.count > 0); if (skb == head) { if (skb_has_frag_list(skb)) { txm->frag_skb = skb_shinfo(skb)->frag_list; txm->frag_offset = 0; continue; } } else if (skb->next) { txm->frag_skb = skb->next; txm->frag_offset = 0; continue; } /* Successfully sent the whole packet, account for it. */ sk->sk_wmem_queued -= txm->sent; total_sent += txm->sent; skb_dequeue(&sk->sk_write_queue); kfree_skb(head); KCM_STATS_INCR(psock->stats.tx_msgs); } out: if (!head) { /* Done with all queued messages. */ WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); if (psock) unreserve_psock(kcm); } /* Check if write space is available */ sk->sk_write_space(sk); return total_sent ? : ret; } static void kcm_tx_work(struct work_struct *w) { struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work); struct sock *sk = &kcm->sk; int err; lock_sock(sk); /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx * aborts */ err = kcm_write_msgs(kcm); if (err < 0) { /* Hard failure in write, report error on KCM socket */ pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err); report_csk_error(&kcm->sk, -err); goto out; } /* Primarily for SOCK_SEQPACKET sockets */ if (likely(sk->sk_socket) && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_space(sk); } out: release_sock(sk); } static void kcm_push(struct kcm_sock *kcm) { if (kcm->tx_wait_more) kcm_write_msgs(kcm); } static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); struct sk_buff *skb = NULL, *head = NULL; size_t copy, copied = 0; long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); int eor = (sock->type == SOCK_DGRAM) ? !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR); int err = -EPIPE; lock_sock(sk); /* Per tcp_sendmsg this should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err) goto out_error; if (kcm->seq_skb) { /* Previously opened message */ head = kcm->seq_skb; skb = kcm_tx_msg(head)->last_skb; goto start; } /* Call the sk_stream functions to manage the sndbuf mem. */ if (!sk_stream_memory_free(sk)) { kcm_push(kcm); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = sk_stream_wait_memory(sk, &timeo); if (err) goto out_error; } if (msg_data_left(msg)) { /* New message, alloc head skb */ head = alloc_skb(0, sk->sk_allocation); while (!head) { kcm_push(kcm); err = sk_stream_wait_memory(sk, &timeo); if (err) goto out_error; head = alloc_skb(0, sk->sk_allocation); } skb = head; /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling * csum_and_copy_from_iter from skb_do_copy_data_nocache. */ skb->ip_summed = CHECKSUM_UNNECESSARY; } start: while (msg_data_left(msg)) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) goto wait_for_memory; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { if (i == MAX_SKB_FRAGS) { struct sk_buff *tskb; tskb = alloc_skb(0, sk->sk_allocation); if (!tskb) goto wait_for_memory; if (head == skb) skb_shinfo(head)->frag_list = tskb; else skb->next = tskb; skb = tskb; skb->ip_summed = CHECKSUM_UNNECESSARY; continue; } merge = false; } if (msg->msg_flags & MSG_SPLICE_PAGES) { copy = msg_data_left(msg); if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; err = skb_splice_from_iter(skb, &msg->msg_iter, copy, sk->sk_allocation); if (err < 0) { if (err == -EMSGSIZE) goto wait_for_memory; goto out_error; } copy = err; skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); if (head != skb) head->truesize += copy; } else { copy = min_t(int, msg_data_left(msg), pfrag->size - pfrag->offset); if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, pfrag->offset, copy); if (err) goto out_error; /* Update the skb. */ if (merge) { skb_frag_size_add( &skb_shinfo(skb)->frags[i - 1], copy); } else { skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy); get_page(pfrag->page); } pfrag->offset += copy; } copied += copy; if (head != skb) { head->len += copy; head->data_len += copy; } continue; wait_for_memory: kcm_push(kcm); err = sk_stream_wait_memory(sk, &timeo); if (err) goto out_error; } if (eor) { bool not_busy = skb_queue_empty(&sk->sk_write_queue); if (head) { /* Message complete, queue it on send buffer */ __skb_queue_tail(&sk->sk_write_queue, head); kcm->seq_skb = NULL; KCM_STATS_INCR(kcm->stats.tx_msgs); } if (msg->msg_flags & MSG_BATCH) { kcm->tx_wait_more = true; } else if (kcm->tx_wait_more || not_busy) { err = kcm_write_msgs(kcm); if (err < 0) { /* We got a hard error in write_msgs but have * already queued this message. Report an error * in the socket, but don't affect return value * from sendmsg */ pr_warn("KCM: Hard failure on kcm_write_msgs\n"); report_csk_error(&kcm->sk, -err); } } } else { /* Message not complete, save state */ partial_message: if (head) { kcm->seq_skb = head; kcm_tx_msg(head)->last_skb = skb; } } KCM_STATS_ADD(kcm->stats.tx_bytes, copied); release_sock(sk); return copied; out_error: kcm_push(kcm); if (sock->type == SOCK_SEQPACKET) { /* Wrote some bytes before encountering an * error, return partial success. */ if (copied) goto partial_message; if (head != kcm->seq_skb) kfree_skb(head); } else { kfree_skb(head); kcm->seq_skb = NULL; } err = sk_stream_error(sk, msg->msg_flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) sk->sk_write_space(sk); release_sock(sk); return err; } static void kcm_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); if (skb_queue_empty_lockless(&sk->sk_write_queue)) return; lock_sock(sk); kcm_write_msgs(kcm); release_sock(sk); } static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); int err = 0; struct strp_msg *stm; int copied = 0; struct sk_buff *skb; skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; /* Okay, have a message on the receive queue */ stm = strp_msg(skb); if (len > stm->full_len) len = stm->full_len; err = skb_copy_datagram_msg(skb, stm->offset, msg, len); if (err < 0) goto out; copied = len; if (likely(!(flags & MSG_PEEK))) { KCM_STATS_ADD(kcm->stats.rx_bytes, copied); if (copied < stm->full_len) { if (sock->type == SOCK_DGRAM) { /* Truncated message */ msg->msg_flags |= MSG_TRUNC; goto msg_finished; } stm->offset += copied; stm->full_len -= copied; } else { msg_finished: /* Finished with message */ msg->msg_flags |= MSG_EOR; KCM_STATS_INCR(kcm->stats.rx_msgs); } } out: skb_free_datagram(sk, skb); return copied ? : err; } static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); struct strp_msg *stm; int err = 0; ssize_t copied; struct sk_buff *skb; /* Only support splice for SOCKSEQPACKET */ skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto err_out; /* Okay, have a message on the receive queue */ stm = strp_msg(skb); if (len > stm->full_len) len = stm->full_len; copied = skb_splice_bits(skb, sk, stm->offset, pipe, len, flags); if (copied < 0) { err = copied; goto err_out; } KCM_STATS_ADD(kcm->stats.rx_bytes, copied); stm->offset += copied; stm->full_len -= copied; /* We have no way to return MSG_EOR. If all the bytes have been * read we still leave the message in the receive socket buffer. * A subsequent recvmsg needs to be done to return MSG_EOR and * finish reading the message. */ skb_free_datagram(sk, skb); return copied; err_out: skb_free_datagram(sk, skb); return err; } /* kcm sock lock held */ static void kcm_recv_disable(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; if (kcm->rx_disabled) return; spin_lock_bh(&mux->rx_lock); kcm->rx_disabled = 1; /* If a psock is reserved we'll do cleanup in unreserve */ if (!kcm->rx_psock) { if (kcm->rx_wait) { list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); } requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); } spin_unlock_bh(&mux->rx_lock); } /* kcm sock lock held */ static void kcm_recv_enable(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; if (!kcm->rx_disabled) return; spin_lock_bh(&mux->rx_lock); kcm->rx_disabled = 0; kcm_rcv_ready(kcm); spin_unlock_bh(&mux->rx_lock); } static int kcm_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct kcm_sock *kcm = kcm_sk(sock->sk); int val, valbool; int err = 0; if (level != SOL_KCM) return -ENOPROTOOPT; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; valbool = val ? 1 : 0; switch (optname) { case KCM_RECV_DISABLE: lock_sock(&kcm->sk); if (valbool) kcm_recv_disable(kcm); else kcm_recv_enable(kcm); release_sock(&kcm->sk); break; default: err = -ENOPROTOOPT; } return err; } static int kcm_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct kcm_sock *kcm = kcm_sk(sock->sk); int val, len; if (level != SOL_KCM) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; len = min_t(unsigned int, len, sizeof(int)); switch (optname) { case KCM_RECV_DISABLE: val = kcm->rx_disabled; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) { struct kcm_sock *tkcm; struct list_head *head; int index = 0; /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so * we set sk_state, otherwise epoll_wait always returns right away with * EPOLLHUP */ kcm->sk.sk_state = TCP_ESTABLISHED; /* Add to mux's kcm sockets list */ kcm->mux = mux; spin_lock_bh(&mux->lock); head = &mux->kcm_socks; list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) { if (tkcm->index != index) break; head = &tkcm->kcm_sock_list; index++; } list_add(&kcm->kcm_sock_list, head); kcm->index = index; mux->kcm_socks_cnt++; spin_unlock_bh(&mux->lock); INIT_WORK(&kcm->tx_work, kcm_tx_work); spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); spin_unlock_bh(&mux->rx_lock); } static int kcm_attach(struct socket *sock, struct socket *csock, struct bpf_prog *prog) { struct kcm_sock *kcm = kcm_sk(sock->sk); struct kcm_mux *mux = kcm->mux; struct sock *csk; struct kcm_psock *psock = NULL, *tpsock; struct list_head *head; int index = 0; static const struct strp_callbacks cb = { .rcv_msg = kcm_rcv_strparser, .parse_msg = kcm_parse_func_strparser, .read_sock_done = kcm_read_sock_done, }; int err = 0; csk = csock->sk; if (!csk) return -EINVAL; lock_sock(csk); /* Only allow TCP sockets to be attached for now */ if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) || csk->sk_protocol != IPPROTO_TCP) { err = -EOPNOTSUPP; goto out; } /* Don't allow listeners or closed sockets */ if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) { err = -EOPNOTSUPP; goto out; } psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); if (!psock) { err = -ENOMEM; goto out; } psock->mux = mux; psock->sk = csk; psock->bpf_prog = prog; write_lock_bh(&csk->sk_callback_lock); /* Check if sk_user_data is already by KCM or someone else. * Must be done under lock to prevent race conditions. */ if (csk->sk_user_data) { write_unlock_bh(&csk->sk_callback_lock); kmem_cache_free(kcm_psockp, psock); err = -EALREADY; goto out; } err = strp_init(&psock->strp, csk, &cb); if (err) { write_unlock_bh(&csk->sk_callback_lock); kmem_cache_free(kcm_psockp, psock); goto out; } psock->save_data_ready = csk->sk_data_ready; psock->save_write_space = csk->sk_write_space; psock->save_state_change = csk->sk_state_change; csk->sk_user_data = psock; csk->sk_data_ready = psock_data_ready; csk->sk_write_space = psock_write_space; csk->sk_state_change = psock_state_change; write_unlock_bh(&csk->sk_callback_lock); sock_hold(csk); /* Finished initialization, now add the psock to the MUX. */ spin_lock_bh(&mux->lock); head = &mux->psocks; list_for_each_entry(tpsock, &mux->psocks, psock_list) { if (tpsock->index != index) break; head = &tpsock->psock_list; index++; } list_add(&psock->psock_list, head); psock->index = index; KCM_STATS_INCR(mux->stats.psock_attach); mux->psocks_cnt++; psock_now_avail(psock); spin_unlock_bh(&mux->lock); /* Schedule RX work in case there are already bytes queued */ strp_check_rcv(&psock->strp); out: release_sock(csk); return err; } static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) { struct socket *csock; struct bpf_prog *prog; int err; csock = sockfd_lookup(info->fd, &err); if (!csock) return -ENOENT; prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out; } err = kcm_attach(sock, csock, prog); if (err) { bpf_prog_put(prog); goto out; } /* Keep reference on file also */ return 0; out: sockfd_put(csock); return err; } static void kcm_unattach(struct kcm_psock *psock) { struct sock *csk = psock->sk; struct kcm_mux *mux = psock->mux; lock_sock(csk); /* Stop getting callbacks from TCP socket. After this there should * be no way to reserve a kcm for this psock. */ write_lock_bh(&csk->sk_callback_lock); csk->sk_user_data = NULL; csk->sk_data_ready = psock->save_data_ready; csk->sk_write_space = psock->save_write_space; csk->sk_state_change = psock->save_state_change; strp_stop(&psock->strp); if (WARN_ON(psock->rx_kcm)) { write_unlock_bh(&csk->sk_callback_lock); release_sock(csk); return; } spin_lock_bh(&mux->rx_lock); /* Stop receiver activities. After this point psock should not be * able to get onto ready list either through callbacks or work. */ if (psock->ready_rx_msg) { list_del(&psock->psock_ready_list); kfree_skb(psock->ready_rx_msg); psock->ready_rx_msg = NULL; KCM_STATS_INCR(mux->stats.rx_ready_drops); } spin_unlock_bh(&mux->rx_lock); write_unlock_bh(&csk->sk_callback_lock); /* Call strp_done without sock lock */ release_sock(csk); strp_done(&psock->strp); lock_sock(csk); bpf_prog_put(psock->bpf_prog); spin_lock_bh(&mux->lock); aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats); save_strp_stats(&psock->strp, &mux->aggregate_strp_stats); KCM_STATS_INCR(mux->stats.psock_unattach); if (psock->tx_kcm) { /* psock was reserved. Just mark it finished and we will clean * up in the kcm paths, we need kcm lock which can not be * acquired here. */ KCM_STATS_INCR(mux->stats.psock_unattach_rsvd); spin_unlock_bh(&mux->lock); /* We are unattaching a socket that is reserved. Abort the * socket since we may be out of sync in sending on it. We need * to do this without the mux lock. */ kcm_abort_tx_psock(psock, EPIPE, false); spin_lock_bh(&mux->lock); if (!psock->tx_kcm) { /* psock now unreserved in window mux was unlocked */ goto no_reserved; } psock->done = 1; /* Commit done before queuing work to process it */ smp_mb(); /* Queue tx work to make sure psock->done is handled */ queue_work(kcm_wq, &psock->tx_kcm->tx_work); spin_unlock_bh(&mux->lock); } else { no_reserved: if (!psock->tx_stopped) list_del(&psock->psock_avail_list); list_del(&psock->psock_list); mux->psocks_cnt--; spin_unlock_bh(&mux->lock); sock_put(csk); fput(csk->sk_socket->file); kmem_cache_free(kcm_psockp, psock); } release_sock(csk); } static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) { struct kcm_sock *kcm = kcm_sk(sock->sk); struct kcm_mux *mux = kcm->mux; struct kcm_psock *psock; struct socket *csock; struct sock *csk; int err; csock = sockfd_lookup(info->fd, &err); if (!csock) return -ENOENT; csk = csock->sk; if (!csk) { err = -EINVAL; goto out; } err = -ENOENT; spin_lock_bh(&mux->lock); list_for_each_entry(psock, &mux->psocks, psock_list) { if (psock->sk != csk) continue; /* Found the matching psock */ if (psock->unattaching || WARN_ON(psock->done)) { err = -EALREADY; break; } psock->unattaching = 1; spin_unlock_bh(&mux->lock); /* Lower socket lock should already be held */ kcm_unattach(psock); err = 0; goto out; } spin_unlock_bh(&mux->lock); out: sockfd_put(csock); return err; } static struct proto kcm_proto = { .name = "KCM", .owner = THIS_MODULE, .obj_size = sizeof(struct kcm_sock), }; /* Clone a kcm socket. */ static struct file *kcm_clone(struct socket *osock) { struct socket *newsock; struct sock *newsk; newsock = sock_alloc(); if (!newsock) return ERR_PTR(-ENFILE); newsock->type = osock->type; newsock->ops = osock->ops; __module_get(newsock->ops->owner); newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, &kcm_proto, false); if (!newsk) { sock_release(newsock); return ERR_PTR(-ENOMEM); } sock_init_data(newsock, newsk); init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); return sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); } static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err; switch (cmd) { case SIOCKCMATTACH: { struct kcm_attach info; if (copy_from_user(&info, (void __user *)arg, sizeof(info))) return -EFAULT; err = kcm_attach_ioctl(sock, &info); break; } case SIOCKCMUNATTACH: { struct kcm_unattach info; if (copy_from_user(&info, (void __user *)arg, sizeof(info))) return -EFAULT; err = kcm_unattach_ioctl(sock, &info); break; } case SIOCKCMCLONE: { struct kcm_clone info; struct file *file; info.fd = get_unused_fd_flags(0); if (unlikely(info.fd < 0)) return info.fd; file = kcm_clone(sock); if (IS_ERR(file)) { put_unused_fd(info.fd); return PTR_ERR(file); } if (copy_to_user((void __user *)arg, &info, sizeof(info))) { put_unused_fd(info.fd); fput(file); return -EFAULT; } fd_install(info.fd, file); err = 0; break; } default: err = -ENOIOCTLCMD; break; } return err; } static void free_mux(struct rcu_head *rcu) { struct kcm_mux *mux = container_of(rcu, struct kcm_mux, rcu); kmem_cache_free(kcm_muxp, mux); } static void release_mux(struct kcm_mux *mux) { struct kcm_net *knet = mux->knet; struct kcm_psock *psock, *tmp_psock; /* Release psocks */ list_for_each_entry_safe(psock, tmp_psock, &mux->psocks, psock_list) { if (!WARN_ON(psock->unattaching)) kcm_unattach(psock); } if (WARN_ON(mux->psocks_cnt)) return; __skb_queue_purge(&mux->rx_hold_queue); mutex_lock(&knet->mutex); aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats); aggregate_psock_stats(&mux->aggregate_psock_stats, &knet->aggregate_psock_stats); aggregate_strp_stats(&mux->aggregate_strp_stats, &knet->aggregate_strp_stats); list_del_rcu(&mux->kcm_mux_list); knet->count--; mutex_unlock(&knet->mutex); call_rcu(&mux->rcu, free_mux); } static void kcm_done(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; struct sock *sk = &kcm->sk; int socks_cnt; spin_lock_bh(&mux->rx_lock); if (kcm->rx_psock) { /* Cleanup in unreserve_rx_kcm */ WARN_ON(kcm->done); kcm->rx_disabled = 1; kcm->done = 1; spin_unlock_bh(&mux->rx_lock); return; } if (kcm->rx_wait) { list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); } /* Move any pending receive messages to other kcm sockets */ requeue_rx_msgs(mux, &sk->sk_receive_queue); spin_unlock_bh(&mux->rx_lock); if (WARN_ON(sk_rmem_alloc_get(sk))) return; /* Detach from MUX */ spin_lock_bh(&mux->lock); list_del(&kcm->kcm_sock_list); mux->kcm_socks_cnt--; socks_cnt = mux->kcm_socks_cnt; spin_unlock_bh(&mux->lock); if (!socks_cnt) { /* We are done with the mux now. */ release_mux(mux); } WARN_ON(kcm->rx_wait); sock_put(&kcm->sk); } /* Called by kcm_release to close a KCM socket. * If this is the last KCM socket on the MUX, destroy the MUX. */ static int kcm_release(struct socket *sock) { struct sock *sk = sock->sk; struct kcm_sock *kcm; struct kcm_mux *mux; struct kcm_psock *psock; if (!sk) return 0; kcm = kcm_sk(sk); mux = kcm->mux; lock_sock(sk); sock_orphan(sk); kfree_skb(kcm->seq_skb); /* Purge queue under lock to avoid race condition with tx_work trying * to act when queue is nonempty. If tx_work runs after this point * it will just return. */ __skb_queue_purge(&sk->sk_write_queue); /* Set tx_stopped. This is checked when psock is bound to a kcm and we * get a writespace callback. This prevents further work being queued * from the callback (unbinding the psock occurs after canceling work. */ kcm->tx_stopped = 1; release_sock(sk); spin_lock_bh(&mux->lock); if (kcm->tx_wait) { /* Take of tx_wait list, after this point there should be no way * that a psock will be assigned to this kcm. */ list_del(&kcm->wait_psock_list); kcm->tx_wait = false; } spin_unlock_bh(&mux->lock); /* Cancel work. After this point there should be no outside references * to the kcm socket. */ cancel_work_sync(&kcm->tx_work); lock_sock(sk); psock = kcm->tx_psock; if (psock) { /* A psock was reserved, so we need to kill it since it * may already have some bytes queued from a message. We * need to do this after removing kcm from tx_wait list. */ kcm_abort_tx_psock(psock, EPIPE, false); unreserve_psock(kcm); } release_sock(sk); WARN_ON(kcm->tx_wait); WARN_ON(kcm->tx_psock); sock->sk = NULL; kcm_done(kcm); return 0; } static const struct proto_ops kcm_dgram_ops = { .family = PF_KCM, .owner = THIS_MODULE, .release = kcm_release, .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = kcm_setsockopt, .getsockopt = kcm_getsockopt, .sendmsg = kcm_sendmsg, .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, .splice_eof = kcm_splice_eof, }; static const struct proto_ops kcm_seqpacket_ops = { .family = PF_KCM, .owner = THIS_MODULE, .release = kcm_release, .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = kcm_setsockopt, .getsockopt = kcm_getsockopt, .sendmsg = kcm_sendmsg, .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, .splice_eof = kcm_splice_eof, .splice_read = kcm_splice_read, }; /* Create proto operation for kcm sockets */ static int kcm_create(struct net *net, struct socket *sock, int protocol, int kern) { struct kcm_net *knet = net_generic(net, kcm_net_id); struct sock *sk; struct kcm_mux *mux; switch (sock->type) { case SOCK_DGRAM: sock->ops = &kcm_dgram_ops; break; case SOCK_SEQPACKET: sock->ops = &kcm_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT; } if (protocol != KCMPROTO_CONNECTED) return -EPROTONOSUPPORT; sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern); if (!sk) return -ENOMEM; /* Allocate a kcm mux, shared between KCM sockets */ mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL); if (!mux) { sk_free(sk); return -ENOMEM; } spin_lock_init(&mux->lock); spin_lock_init(&mux->rx_lock); INIT_LIST_HEAD(&mux->kcm_socks); INIT_LIST_HEAD(&mux->kcm_rx_waiters); INIT_LIST_HEAD(&mux->kcm_tx_waiters); INIT_LIST_HEAD(&mux->psocks); INIT_LIST_HEAD(&mux->psocks_ready); INIT_LIST_HEAD(&mux->psocks_avail); mux->knet = knet; /* Add new MUX to list */ mutex_lock(&knet->mutex); list_add_rcu(&mux->kcm_mux_list, &knet->mux_list); knet->count++; mutex_unlock(&knet->mutex); skb_queue_head_init(&mux->rx_hold_queue); /* Init KCM socket */ sock_init_data(sock, sk); init_kcm_sock(kcm_sk(sk), mux); return 0; } static const struct net_proto_family kcm_family_ops = { .family = PF_KCM, .create = kcm_create, .owner = THIS_MODULE, }; static __net_init int kcm_init_net(struct net *net) { struct kcm_net *knet = net_generic(net, kcm_net_id); INIT_LIST_HEAD_RCU(&knet->mux_list); mutex_init(&knet->mutex); return 0; } static __net_exit void kcm_exit_net(struct net *net) { struct kcm_net *knet = net_generic(net, kcm_net_id); /* All KCM sockets should be closed at this point, which should mean * that all multiplexors and psocks have been destroyed. */ WARN_ON(!list_empty(&knet->mux_list)); mutex_destroy(&knet->mutex); } static struct pernet_operations kcm_net_ops = { .init = kcm_init_net, .exit = kcm_exit_net, .id = &kcm_net_id, .size = sizeof(struct kcm_net), }; static int __init kcm_init(void) { int err = -ENOMEM; kcm_muxp = KMEM_CACHE(kcm_mux, SLAB_HWCACHE_ALIGN); if (!kcm_muxp) goto fail; kcm_psockp = KMEM_CACHE(kcm_psock, SLAB_HWCACHE_ALIGN); if (!kcm_psockp) goto fail; kcm_wq = create_singlethread_workqueue("kkcmd"); if (!kcm_wq) goto fail; err = proto_register(&kcm_proto, 1); if (err) goto fail; err = register_pernet_device(&kcm_net_ops); if (err) goto net_ops_fail; err = sock_register(&kcm_family_ops); if (err) goto sock_register_fail; err = kcm_proc_init(); if (err) goto proc_init_fail; return 0; proc_init_fail: sock_unregister(PF_KCM); sock_register_fail: unregister_pernet_device(&kcm_net_ops); net_ops_fail: proto_unregister(&kcm_proto); fail: kmem_cache_destroy(kcm_muxp); kmem_cache_destroy(kcm_psockp); if (kcm_wq) destroy_workqueue(kcm_wq); return err; } static void __exit kcm_exit(void) { kcm_proc_exit(); sock_unregister(PF_KCM); unregister_pernet_device(&kcm_net_ops); proto_unregister(&kcm_proto); destroy_workqueue(kcm_wq); kmem_cache_destroy(kcm_muxp); kmem_cache_destroy(kcm_psockp); } module_init(kcm_init); module_exit(kcm_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("KCM (Kernel Connection Multiplexor) sockets"); MODULE_ALIAS_NETPROTO(PF_KCM); |
4 47 60 665 56 1 8 73 52 142 126 653 499 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CGROUP_H #define _LINUX_CGROUP_H /* * cgroup interface * * Copyright (C) 2003 BULL SA * Copyright (C) 2004-2006 Silicon Graphics, Inc. * */ #include <linux/sched.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/rculist.h> #include <linux/cgroupstats.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/kernfs.h> #include <linux/jump_label.h> #include <linux/types.h> #include <linux/ns_common.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/refcount.h> #include <linux/kernel_stat.h> #include <linux/cgroup-defs.h> struct kernel_clone_args; #ifdef CONFIG_CGROUPS /* * All weight knobs on the default hierarchy should use the following min, * default and max values. The default value is the logarithmic center of * MIN and MAX and allows 100x to be expressed in both directions. */ #define CGROUP_WEIGHT_MIN 1 #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 enum { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */ }; /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; unsigned int flags; struct list_head *cset_pos; struct list_head *cset_head; struct list_head *tcset_pos; struct list_head *tcset_head; struct list_head *task_pos; struct list_head *cur_tasks_head; struct css_set *cur_cset; struct css_set *cur_dcset; struct task_struct *cur_task; struct list_head iters_node; /* css_set->task_iters */ }; extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; extern spinlock_t css_set_lock; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) \ extern struct static_key_true _x ## _cgrp_subsys_enabled_key; \ extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key; #include <linux/cgroup_subsys.h> #undef SUBSYS /** * cgroup_subsys_enabled - fast test on whether a subsys is enabled * @ss: subsystem in question */ #define cgroup_subsys_enabled(ss) \ static_branch_likely(&ss ## _enabled_key) /** * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy * @ss: subsystem in question */ #define cgroup_subsys_on_dfl(ss) \ static_branch_likely(&ss ## _on_dfl_key) bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); struct cgroup *cgroup_get_from_path(const char *path); struct cgroup *cgroup_get_from_fd(int fd); struct cgroup *cgroup_v1v2_get_from_fd(int fd); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); void cgroup_fork(struct task_struct *p); extern int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); void cgroup_exit(struct task_struct *p); void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v); /* * Iteration helpers and macros. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent); struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); /** * css_for_each_child - iterate through children of a css * @pos: the css * to use as the loop cursor * @parent: css whose children to walk * * Walk @parent's children. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_child(pos, parent) \ for ((pos) = css_next_child(NULL, (parent)); (pos); \ (pos) = css_next_child((pos), (parent))) /** * css_for_each_descendant_pre - pre-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @root: css whose descendants to walk * * Walk @root's descendants. @root is included in the iteration and the * first node to be visited. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * For example, the following guarantees that a descendant can't escape * state updates of its ancestors. * * my_online(@css) * { * Lock @css's parent and @css; * Inherit state from the parent; * Unlock both. * } * * my_update_state(@css) * { * css_for_each_descendant_pre(@pos, @css) { * Lock @pos; * if (@pos == @css) * Update @css's state; * else * Verify @pos is alive and inherit state from its parent; * Unlock @pos; * } * } * * As long as the inheriting step, including checking the parent state, is * enclosed inside @pos locking, double-locking the parent isn't necessary * while inheriting. The state update to the parent is guaranteed to be * visible by walking order and, as long as inheriting operations to the * same @pos are atomic to each other, multiple updates racing each other * still result in the correct state. It's guaranateed that at least one * inheritance happens for any css after the latest update to its parent. * * If checking parent's state requires locking the parent, each inheriting * iteration should lock and unlock both @pos->parent and @pos. * * Alternatively, a subsystem may choose to use a single global lock to * synchronize ->css_online() and ->css_offline() against tree-walking * operations. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_descendant_pre(pos, css) \ for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ (pos) = css_next_descendant_pre((pos), (css))) /** * css_for_each_descendant_post - post-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @css: css whose descendants to walk * * Similar to css_for_each_descendant_pre() but performs post-order * traversal instead. @root is included in the iteration and the last * node to be visited. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * Note that the walk visibility guarantee example described in pre-order * walk doesn't apply the same to post-order walks. */ #define css_for_each_descendant_post(pos, css) \ for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * @tset may contain multiple tasks and they may belong to multiple * processes. * * On the v2 hierarchy, there may be tasks from multiple processes and they * may not share the source or destination csses. * * On traditional hierarchies, when there are multiple tasks in @tset, if a * task of a process is in @tset, all tasks of the process are in @tset. * Also, all are guaranteed to share the same source and destination csses. * * Iteration is not in any specific order. */ #define cgroup_taskset_for_each(task, dst_css, tset) \ for ((task) = cgroup_taskset_first((tset), &(dst_css)); \ (task); \ (task) = cgroup_taskset_next((tset), &(dst_css))) /** * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any. */ #define cgroup_taskset_for_each_leader(leader, dst_css, tset) \ for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \ (leader); \ (leader) = cgroup_taskset_next((tset), &(dst_css))) \ if ((leader) != (leader)->group_leader) \ ; \ else /* * Inline functions. */ #ifdef CONFIG_DEBUG_CGROUP_REF void css_get(struct cgroup_subsys_state *css); void css_get_many(struct cgroup_subsys_state *css, unsigned int n); bool css_tryget(struct cgroup_subsys_state *css); bool css_tryget_online(struct cgroup_subsys_state *css); void css_put(struct cgroup_subsys_state *css); void css_put_many(struct cgroup_subsys_state *css, unsigned int n); #else #define CGROUP_REF_FN_ATTRS static inline #define CGROUP_REF_EXPORT(fn) #include <linux/cgroup_refcnt.h> #endif static inline u64 cgroup_id(const struct cgroup *cgrp) { return cgrp->kn->id; } /** * css_is_dying - test whether the specified css is dying * @css: target css * * Test whether @css is in the process of offlining or already offline. In * most cases, ->css_online() and ->css_offline() callbacks should be * enough; however, the actual offline operations are RCU delayed and this * test returns %true also when @css is scheduled to be offlined. * * This is useful, for example, when the use case requires synchronous * behavior with respect to cgroup removal. cgroup removal schedules css * offlining but the css can seem alive while the operation is being * delayed. If the delay affects user visible semantics, this test can be * used to resolve the situation. */ static inline bool css_is_dying(struct cgroup_subsys_state *css) { return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt); } static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); } static inline bool cgroup_tryget(struct cgroup *cgrp) { return css_tryget(&cgrp->self); } static inline void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); } extern struct mutex cgroup_mutex; static inline void cgroup_lock(void) { mutex_lock(&cgroup_mutex); } static inline void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); } /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for * @__c: extra condition expression to be passed to rcu_dereference_check() * * A task's css_set is RCU protected, initialized and exited while holding * task_lock(), and can only be modified while holding both cgroup_mutex * and task_lock() while the task is alive. This macro verifies that the * caller is inside proper critical section and returns @task's css_set. * * The caller can also specify additional allowed conditions via @__c, such * as locks used during the cgroup_subsys::attach() methods. */ #ifdef CONFIG_PROVE_RCU #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ rcu_read_lock_sched_held() || \ lockdep_is_held(&cgroup_mutex) || \ lockdep_is_held(&css_set_lock) || \ ((task)->flags & PF_EXITING) || (__c)) #else #define task_css_set_check(task, __c) \ rcu_dereference((task)->cgroups) #endif /** * task_css_check - obtain css for (task, subsys) w/ extra access conds * @task: the target task * @subsys_id: the target subsystem ID * @__c: extra condition expression to be passed to rcu_dereference_check() * * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The * synchronization rules are the same as task_css_set_check(). */ #define task_css_check(task, subsys_id, __c) \ task_css_set_check((task), (__c))->subsys[(subsys_id)] /** * task_css_set - obtain a task's css_set * @task: the task to obtain css_set for * * See task_css_set_check(). */ static inline struct css_set *task_css_set(struct task_struct *task) { return task_css_set_check(task, false); } /** * task_css - obtain css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * See task_css_check(). */ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, false); } /** * task_get_css - find and get the css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * Find the css for the (@task, @subsys_id) combination, increment a * reference on and return it. This function is guaranteed to return a * valid css. The returned css may already have been offlined. */ static inline struct cgroup_subsys_state * task_get_css(struct task_struct *task, int subsys_id) { struct cgroup_subsys_state *css; rcu_read_lock(); while (true) { css = task_css(task, subsys_id); /* * Can't use css_tryget_online() here. A task which has * PF_EXITING set may stay associated with an offline css. * If such task calls this function, css_tryget_online() * will keep failing. */ if (likely(css_tryget(css))) break; cpu_relax(); } rcu_read_unlock(); return css; } /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task * @subsys_id: the target subsystem ID * * Test whether @task belongs to the root css on the specified subsystem. * May be invoked in any context. */ static inline bool task_css_is_root(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, true) == init_css_set.subsys[subsys_id]; } static inline struct cgroup *task_cgroup(struct task_struct *task, int subsys_id) { return task_css(task, subsys_id)->cgroup; } static inline struct cgroup *task_dfl_cgroup(struct task_struct *task) { return task_css_set(task)->dfl_cgrp; } static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { struct cgroup_subsys_state *parent_css = cgrp->self.parent; if (parent_css) return container_of(parent_css, struct cgroup, self); return NULL; } /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested * @ancestor: possible ancestor of @cgrp * * Test whether @cgrp is a descendant of @ancestor. It also returns %true * if @cgrp == @ancestor. This function is safe to call as long as @cgrp * and @ancestor are accessible. */ static inline bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; return cgrp->ancestors[ancestor->level] == ancestor; } /** * cgroup_ancestor - find ancestor of cgroup * @cgrp: cgroup to find ancestor of * @ancestor_level: level of ancestor to find starting from root * * Find ancestor of cgroup at specified level starting from root if it exists * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at * @ancestor_level. * * This function is safe to call as long as @cgrp is accessible. */ static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, int ancestor_level) { if (ancestor_level < 0 || ancestor_level > cgrp->level) return NULL; return cgrp->ancestors[ancestor_level]; } /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup * * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. * It follows all the same rules as cgroup_is_descendant, and only applies * to the default hierarchy. */ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { struct css_set *cset = task_css_set(task); return cgroup_is_descendant(cset->dfl_cgrp, ancestor); } /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + cgrp->nr_populated_threaded_children; } /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { return kernfs_ino(cgrp->kn); } /* cft/css accessors for cftype->write() operation */ static inline struct cftype *of_cft(struct kernfs_open_file *of) { return of->kn->priv; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); /* cft/css accessors for cftype->seq_*() operations */ static inline struct cftype *seq_cft(struct seq_file *seq) { return of_cft(seq->private); } static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) { return of_css(seq->private); } /* * Name / path handling functions. All are thin wrappers around the kernfs * counterparts and can be called under any context. */ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_name(cgrp->kn, buf, buflen); } static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_path(cgrp->kn, buf, buflen); } static inline void pr_cont_cgroup_name(struct cgroup *cgrp) { pr_cont_kernfs_name(cgrp->kn); } static inline void pr_cont_cgroup_path(struct cgroup *cgrp) { pr_cont_kernfs_path(cgrp->kn); } bool cgroup_psi_enabled(void); static inline void cgroup_init_kthreadd(void) { /* * kthreadd is inherited by all kthreads, keep it in the root so * that the new kthreads are guaranteed to stay in the root until * initialization is finished. */ current->no_cgroup_migration = 1; } static inline void cgroup_kthread_ready(void) { /* * This kthread finished initialization. The creator should have * set PF_NO_SETAFFINITY if this kthread should stay in the root. */ current->no_cgroup_migration = 0; } void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); struct cgroup *cgroup_get_from_id(u64 id); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; struct cgroup; static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline void cgroup_lock(void) {} static inline void cgroup_unlock(void) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { return -EINVAL; } static inline void cgroup_fork(struct task_struct *p) {} static inline int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs) { return 0; } static inline void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_exit(struct task_struct *p) {} static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { return NULL; } static inline bool cgroup_psi_enabled(void) { return false; } static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { return true; } static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS /* * cgroup scalable recursive statistics. */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); /* * Basic resource stats. */ #ifdef CONFIG_CGROUP_CPUACCT void cpuacct_charge(struct task_struct *tsk, u64 cputime); void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} #endif void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec); static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) { struct cgroup *cgrp; cpuacct_charge(task, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime(cgrp, delta_exec); } static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup *cgrp; cpuacct_account_field(task, index, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime_field(cgrp, index, delta_exec); } #else /* CONFIG_CGROUPS */ static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) {} static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) {} #endif /* CONFIG_CGROUPS */ /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. */ #ifdef CONFIG_SOCK_CGROUP_DATA void cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { return skcd->cgroup; } #else /* CONFIG_CGROUP_DATA */ static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ struct cgroup_namespace { struct ns_common ns; struct user_namespace *user_ns; struct ucounts *ucounts; struct css_set *root_cset; }; extern struct cgroup_namespace init_cgroup_ns; #ifdef CONFIG_CGROUPS void free_cgroup_ns(struct cgroup_namespace *ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns); int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); #else /* !CONFIG_CGROUPS */ static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } static inline struct cgroup_namespace * copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { return old_ns; } #endif /* !CONFIG_CGROUPS */ static inline void get_cgroup_ns(struct cgroup_namespace *ns) { if (ns) refcount_inc(&ns->ns.count); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { if (ns && refcount_dec_and_test(&ns->ns.count)) free_cgroup_ns(ns); } #ifdef CONFIG_CGROUPS void cgroup_enter_frozen(void); void cgroup_leave_frozen(bool always_leave); void cgroup_update_frozen(struct cgroup *cgrp); void cgroup_freeze(struct cgroup *cgrp, bool freeze); void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst); static inline bool cgroup_task_frozen(struct task_struct *task) { return task->frozen; } #else /* !CONFIG_CGROUPS */ static inline void cgroup_enter_frozen(void) { } static inline void cgroup_leave_frozen(bool always_leave) { } static inline bool cgroup_task_frozen(struct task_struct *task) { return false; } #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUP_BPF static inline void cgroup_bpf_get(struct cgroup *cgrp) { percpu_ref_get(&cgrp->bpf.refcnt); } static inline void cgroup_bpf_put(struct cgroup *cgrp) { percpu_ref_put(&cgrp->bpf.refcnt); } #else /* CONFIG_CGROUP_BPF */ static inline void cgroup_bpf_get(struct cgroup *cgrp) {} static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #endif /* CONFIG_CGROUP_BPF */ struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); #endif /* _LINUX_CGROUP_H */ |
7 188 207 40 164 58 150 26 208 203 47 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * sha256_base.h - core logic for SHA-256 implementations * * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> */ #ifndef _CRYPTO_SHA256_BASE_H #define _CRYPTO_SHA256_BASE_H #include <asm/byteorder.h> #include <asm/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/sha2.h> #include <linux/string.h> #include <linux/types.h> typedef void (sha256_block_fn)(struct sha256_state *sst, u8 const *src, int blocks); static inline int sha224_base_init(struct shash_desc *desc) { struct sha256_state *sctx = shash_desc_ctx(desc); sha224_init(sctx); return 0; } static inline int sha256_base_init(struct shash_desc *desc) { struct sha256_state *sctx = shash_desc_ctx(desc); sha256_init(sctx); return 0; } static inline int lib_sha256_base_do_update(struct sha256_state *sctx, const u8 *data, unsigned int len, sha256_block_fn *block_fn) { unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; sctx->count += len; if (unlikely((partial + len) >= SHA256_BLOCK_SIZE)) { int blocks; if (partial) { int p = SHA256_BLOCK_SIZE - partial; memcpy(sctx->buf + partial, data, p); data += p; len -= p; block_fn(sctx, sctx->buf, 1); } blocks = len / SHA256_BLOCK_SIZE; len %= SHA256_BLOCK_SIZE; if (blocks) { block_fn(sctx, data, blocks); data += blocks * SHA256_BLOCK_SIZE; } partial = 0; } if (len) memcpy(sctx->buf + partial, data, len); return 0; } static inline int sha256_base_do_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha256_block_fn *block_fn) { struct sha256_state *sctx = shash_desc_ctx(desc); return lib_sha256_base_do_update(sctx, data, len, block_fn); } static inline int lib_sha256_base_do_finalize(struct sha256_state *sctx, sha256_block_fn *block_fn) { const int bit_offset = SHA256_BLOCK_SIZE - sizeof(__be64); __be64 *bits = (__be64 *)(sctx->buf + bit_offset); unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; sctx->buf[partial++] = 0x80; if (partial > bit_offset) { memset(sctx->buf + partial, 0x0, SHA256_BLOCK_SIZE - partial); partial = 0; block_fn(sctx, sctx->buf, 1); } memset(sctx->buf + partial, 0x0, bit_offset - partial); *bits = cpu_to_be64(sctx->count << 3); block_fn(sctx, sctx->buf, 1); return 0; } static inline int sha256_base_do_finalize(struct shash_desc *desc, sha256_block_fn *block_fn) { struct sha256_state *sctx = shash_desc_ctx(desc); return lib_sha256_base_do_finalize(sctx, block_fn); } static inline int lib_sha256_base_finish(struct sha256_state *sctx, u8 *out, unsigned int digest_size) { __be32 *digest = (__be32 *)out; int i; for (i = 0; digest_size > 0; i++, digest_size -= sizeof(__be32)) put_unaligned_be32(sctx->state[i], digest++); memzero_explicit(sctx, sizeof(*sctx)); return 0; } static inline int sha256_base_finish(struct shash_desc *desc, u8 *out) { unsigned int digest_size = crypto_shash_digestsize(desc->tfm); struct sha256_state *sctx = shash_desc_ctx(desc); return lib_sha256_base_finish(sctx, out, digest_size); } #endif /* _CRYPTO_SHA256_BASE_H */ |
5 5 5 4 5 8 2 2 2 2 2 2 2 1 3 3 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include <net/genetlink.h> #include <net/sock.h> #include <trace/events/devlink.h> #include "devl_internal.h" struct devlink_fmsg_item { struct list_head list; int attrtype; u8 nla_type; u16 len; int value[]; }; struct devlink_fmsg { struct list_head item_list; int err; /* first error encountered on some devlink_fmsg_XXX() call */ bool putting_binary; /* This flag forces enclosing of binary data * in an array brackets. It forces using * of designated API: * devlink_fmsg_binary_pair_nest_start() * devlink_fmsg_binary_pair_nest_end() */ }; static struct devlink_fmsg *devlink_fmsg_alloc(void) { struct devlink_fmsg *fmsg; fmsg = kzalloc(sizeof(*fmsg), GFP_KERNEL); if (!fmsg) return NULL; INIT_LIST_HEAD(&fmsg->item_list); return fmsg; } static void devlink_fmsg_free(struct devlink_fmsg *fmsg) { struct devlink_fmsg_item *item, *tmp; list_for_each_entry_safe(item, tmp, &fmsg->item_list, list) { list_del(&item->list); kfree(item); } kfree(fmsg); } struct devlink_health_reporter { struct list_head list; void *priv; const struct devlink_health_reporter_ops *ops; struct devlink *devlink; struct devlink_port *devlink_port; struct devlink_fmsg *dump_fmsg; u64 graceful_period; bool auto_recover; bool auto_dump; u8 health_state; u64 dump_ts; u64 dump_real_ts; u64 error_count; u64 recovery_count; u64 last_recovery_ts; }; void * devlink_health_reporter_priv(struct devlink_health_reporter *reporter) { return reporter->priv; } EXPORT_SYMBOL_GPL(devlink_health_reporter_priv); static struct devlink_health_reporter * __devlink_health_reporter_find_by_name(struct list_head *reporter_list, const char *reporter_name) { struct devlink_health_reporter *reporter; list_for_each_entry(reporter, reporter_list, list) if (!strcmp(reporter->ops->name, reporter_name)) return reporter; return NULL; } static struct devlink_health_reporter * devlink_health_reporter_find_by_name(struct devlink *devlink, const char *reporter_name) { return __devlink_health_reporter_find_by_name(&devlink->reporter_list, reporter_name); } static struct devlink_health_reporter * devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port, const char *reporter_name) { return __devlink_health_reporter_find_by_name(&devlink_port->reporter_list, reporter_name); } static struct devlink_health_reporter * __devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; if (WARN_ON(graceful_period && !ops->recover)) return ERR_PTR(-EINVAL); reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); if (!reporter) return ERR_PTR(-ENOMEM); reporter->priv = priv; reporter->ops = ops; reporter->devlink = devlink; reporter->graceful_period = graceful_period; reporter->auto_recover = !!ops->recover; reporter->auto_dump = !!ops->dump; return reporter; } /** * devl_port_health_reporter_create() - create devlink health reporter for * specified port instance * * @port: devlink_port to which health reports will relate * @ops: devlink health reporter ops * @graceful_period: min time (in msec) between recovery attempts * @priv: driver priv pointer */ struct devlink_health_reporter * devl_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; devl_assert_locked(port->devlink); if (__devlink_health_reporter_find_by_name(&port->reporter_list, ops->name)) return ERR_PTR(-EEXIST); reporter = __devlink_health_reporter_create(port->devlink, ops, graceful_period, priv); if (IS_ERR(reporter)) return reporter; reporter->devlink_port = port; list_add_tail(&reporter->list, &port->reporter_list); return reporter; } EXPORT_SYMBOL_GPL(devl_port_health_reporter_create); struct devlink_health_reporter * devlink_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; struct devlink *devlink = port->devlink; devl_lock(devlink); reporter = devl_port_health_reporter_create(port, ops, graceful_period, priv); devl_unlock(devlink); return reporter; } EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); /** * devl_health_reporter_create - create devlink health reporter * * @devlink: devlink instance which the health reports will relate * @ops: devlink health reporter ops * @graceful_period: min time (in msec) between recovery attempts * @priv: driver priv pointer */ struct devlink_health_reporter * devl_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; devl_assert_locked(devlink); if (devlink_health_reporter_find_by_name(devlink, ops->name)) return ERR_PTR(-EEXIST); reporter = __devlink_health_reporter_create(devlink, ops, graceful_period, priv); if (IS_ERR(reporter)) return reporter; list_add_tail(&reporter->list, &devlink->reporter_list); return reporter; } EXPORT_SYMBOL_GPL(devl_health_reporter_create); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; devl_lock(devlink); reporter = devl_health_reporter_create(devlink, ops, graceful_period, priv); devl_unlock(devlink); return reporter; } EXPORT_SYMBOL_GPL(devlink_health_reporter_create); static void devlink_health_reporter_free(struct devlink_health_reporter *reporter) { if (reporter->dump_fmsg) devlink_fmsg_free(reporter->dump_fmsg); kfree(reporter); } /** * devl_health_reporter_destroy() - destroy devlink health reporter * * @reporter: devlink health reporter to destroy */ void devl_health_reporter_destroy(struct devlink_health_reporter *reporter) { devl_assert_locked(reporter->devlink); list_del(&reporter->list); devlink_health_reporter_free(reporter); } EXPORT_SYMBOL_GPL(devl_health_reporter_destroy); void devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) { struct devlink *devlink = reporter->devlink; devl_lock(devlink); devl_health_reporter_destroy(reporter); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); static int devlink_nl_health_reporter_fill(struct sk_buff *msg, struct devlink_health_reporter *reporter, enum devlink_command cmd, u32 portid, u32 seq, int flags) { struct devlink *devlink = reporter->devlink; struct nlattr *reporter_attr; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto genlmsg_cancel; if (reporter->devlink_port) { if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, reporter->devlink_port->index)) goto genlmsg_cancel; } reporter_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_HEALTH_REPORTER); if (!reporter_attr) goto genlmsg_cancel; if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, reporter->ops->name)) goto reporter_nest_cancel; if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, reporter->health_state)) goto reporter_nest_cancel; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, reporter->error_count, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, reporter->recovery_count, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; if (reporter->ops->recover && nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, reporter->graceful_period, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; if (reporter->ops->recover && nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, reporter->auto_recover)) goto reporter_nest_cancel; if (reporter->dump_fmsg && nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, jiffies_to_msecs(reporter->dump_ts), DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; if (reporter->dump_fmsg && nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, reporter->dump_real_ts, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; if (reporter->ops->dump && nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, reporter->auto_dump)) goto reporter_nest_cancel; nla_nest_end(msg, reporter_attr); genlmsg_end(msg, hdr); return 0; reporter_nest_cancel: nla_nest_cancel(msg, reporter_attr); genlmsg_cancel: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static struct devlink_health_reporter * devlink_health_reporter_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { struct devlink_port *devlink_port; char *reporter_name; if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) return NULL; reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); devlink_port = devlink_port_get_from_attrs(devlink, attrs); if (IS_ERR(devlink_port)) return devlink_health_reporter_find_by_name(devlink, reporter_name); else return devlink_port_health_reporter_find_by_name(devlink_port, reporter_name); } static struct devlink_health_reporter * devlink_health_reporter_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_health_reporter_get_from_attrs(devlink, info->attrs); } int devlink_nl_health_reporter_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; struct sk_buff *msg; int err; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_health_reporter_fill(msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_health_reporter_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); const struct genl_info *info = genl_info_dump(cb); struct devlink_health_reporter *reporter; unsigned long port_index_end = ULONG_MAX; struct nlattr **attrs = info->attrs; unsigned long port_index_start = 0; struct devlink_port *port; unsigned long port_index; int idx = 0; int err; if (attrs && attrs[DEVLINK_ATTR_PORT_INDEX]) { port_index_start = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); port_index_end = port_index_start; flags |= NLM_F_DUMP_FILTERED; goto per_port_dump; } list_for_each_entry(reporter, &devlink->reporter_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_health_reporter_fill(msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; return err; } idx++; } per_port_dump: xa_for_each_range(&devlink->ports, port_index, port, port_index_start, port_index_end) { list_for_each_entry(reporter, &port->reporter_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_health_reporter_fill(msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; return err; } idx++; } } return 0; } int devlink_nl_health_reporter_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_health_reporter_get_dump_one); } int devlink_nl_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; if (!reporter->ops->recover && (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) return -EOPNOTSUPP; if (!reporter->ops->dump && info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) return -EOPNOTSUPP; if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) reporter->graceful_period = nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) reporter->auto_recover = nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) reporter->auto_dump = nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]); return 0; } static void devlink_recover_notify(struct devlink_health_reporter *reporter, enum devlink_command cmd) { struct devlink *devlink = reporter->devlink; struct devlink_obj_desc desc; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); ASSERT_DEVLINK_REGISTERED(devlink); if (!devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_health_reporter_fill(msg, reporter, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } devlink_nl_obj_desc_init(&desc, devlink); if (reporter->devlink_port) devlink_nl_obj_desc_port_set(&desc, reporter->devlink_port); devlink_nl_notify_send_desc(devlink, msg, &desc); } void devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) { reporter->recovery_count++; reporter->last_recovery_ts = jiffies; } EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done); static int devlink_health_reporter_recover(struct devlink_health_reporter *reporter, void *priv_ctx, struct netlink_ext_ack *extack) { int err; if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) return 0; if (!reporter->ops->recover) return -EOPNOTSUPP; err = reporter->ops->recover(reporter, priv_ctx, extack); if (err) return err; devlink_health_reporter_recovery_done(reporter); reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); return 0; } static void devlink_health_dump_clear(struct devlink_health_reporter *reporter) { if (!reporter->dump_fmsg) return; devlink_fmsg_free(reporter->dump_fmsg); reporter->dump_fmsg = NULL; } static int devlink_health_do_dump(struct devlink_health_reporter *reporter, void *priv_ctx, struct netlink_ext_ack *extack) { int err; if (!reporter->ops->dump) return 0; if (reporter->dump_fmsg) return 0; reporter->dump_fmsg = devlink_fmsg_alloc(); if (!reporter->dump_fmsg) return -ENOMEM; devlink_fmsg_obj_nest_start(reporter->dump_fmsg); err = reporter->ops->dump(reporter, reporter->dump_fmsg, priv_ctx, extack); if (err) goto dump_err; devlink_fmsg_obj_nest_end(reporter->dump_fmsg); err = reporter->dump_fmsg->err; if (err) goto dump_err; reporter->dump_ts = jiffies; reporter->dump_real_ts = ktime_get_real_ns(); return 0; dump_err: devlink_health_dump_clear(reporter); return err; } int devlink_health_report(struct devlink_health_reporter *reporter, const char *msg, void *priv_ctx) { enum devlink_health_reporter_state prev_health_state; struct devlink *devlink = reporter->devlink; unsigned long recover_ts_threshold; int ret; /* write a log message of the current error */ WARN_ON(!msg); trace_devlink_health_report(devlink, reporter->ops->name, msg); reporter->error_count++; prev_health_state = reporter->health_state; reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); /* abort if the previous error wasn't recovered */ recover_ts_threshold = reporter->last_recovery_ts + msecs_to_jiffies(reporter->graceful_period); if (reporter->auto_recover && (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY || (reporter->last_recovery_ts && reporter->recovery_count && time_is_after_jiffies(recover_ts_threshold)))) { trace_devlink_health_recover_aborted(devlink, reporter->ops->name, reporter->health_state, jiffies - reporter->last_recovery_ts); return -ECANCELED; } if (reporter->auto_dump) { devl_lock(devlink); /* store current dump of current error, for later analysis */ devlink_health_do_dump(reporter, priv_ctx, NULL); devl_unlock(devlink); } if (!reporter->auto_recover) return 0; devl_lock(devlink); ret = devlink_health_reporter_recover(reporter, priv_ctx, NULL); devl_unlock(devlink); return ret; } EXPORT_SYMBOL_GPL(devlink_health_report); void devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, enum devlink_health_reporter_state state) { if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY && state != DEVLINK_HEALTH_REPORTER_STATE_ERROR)) return; if (reporter->health_state == state) return; reporter->health_state = state; trace_devlink_health_reporter_state_update(reporter->devlink, reporter->ops->name, state); devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); } EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); int devlink_nl_health_reporter_recover_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; return devlink_health_reporter_recover(reporter, NULL, info->extack); } static void devlink_fmsg_err_if_binary(struct devlink_fmsg *fmsg) { if (!fmsg->err && fmsg->putting_binary) fmsg->err = -EINVAL; } static void devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, int attrtype) { struct devlink_fmsg_item *item; if (fmsg->err) return; item = kzalloc(sizeof(*item), GFP_KERNEL); if (!item) { fmsg->err = -ENOMEM; return; } item->attrtype = attrtype; list_add_tail(&item->list, &fmsg->item_list); } void devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START); } EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start); static void devlink_fmsg_nest_end(struct devlink_fmsg *fmsg) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END); } void devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg) { devlink_fmsg_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end); #define DEVLINK_FMSG_MAX_SIZE (GENLMSG_DEFAULT_SIZE - GENL_HDRLEN - NLA_HDRLEN) static void devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name) { struct devlink_fmsg_item *item; devlink_fmsg_err_if_binary(fmsg); if (fmsg->err) return; if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE) { fmsg->err = -EMSGSIZE; return; } item = kzalloc(sizeof(*item) + strlen(name) + 1, GFP_KERNEL); if (!item) { fmsg->err = -ENOMEM; return; } item->nla_type = NLA_NUL_STRING; item->len = strlen(name) + 1; item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME; memcpy(&item->value, name, item->len); list_add_tail(&item->list, &fmsg->item_list); } void devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START); devlink_fmsg_put_name(fmsg, name); } EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start); void devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg) { devlink_fmsg_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end); void devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START); } EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_start); void devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg) { devlink_fmsg_nest_end(fmsg); devlink_fmsg_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end); void devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) { devlink_fmsg_arr_pair_nest_start(fmsg, name); fmsg->putting_binary = true; } EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_start); void devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg) { if (fmsg->err) return; if (!fmsg->putting_binary) fmsg->err = -EINVAL; fmsg->putting_binary = false; devlink_fmsg_arr_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_end); static void devlink_fmsg_put_value(struct devlink_fmsg *fmsg, const void *value, u16 value_len, u8 value_nla_type) { struct devlink_fmsg_item *item; if (fmsg->err) return; if (value_len > DEVLINK_FMSG_MAX_SIZE) { fmsg->err = -EMSGSIZE; return; } item = kzalloc(sizeof(*item) + value_len, GFP_KERNEL); if (!item) { fmsg->err = -ENOMEM; return; } item->nla_type = value_nla_type; item->len = value_len; item->attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; memcpy(&item->value, value, item->len); list_add_tail(&item->list, &fmsg->item_list); } static void devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG); } static void devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8); } void devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32); } EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put); static void devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64); } void devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, NLA_NUL_STRING); } EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); void devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, u16 value_len) { if (!fmsg->err && !fmsg->putting_binary) fmsg->err = -EINVAL; devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); } EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); void devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, bool value) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_bool_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_bool_pair_put); void devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, u8 value) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_u8_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_u8_pair_put); void devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, u32 value) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_u32_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_u32_pair_put); void devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, u64 value) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_u64_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_u64_pair_put); void devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, const char *value) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_string_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put); void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, const void *value, u32 value_len) { u32 data_size; u32 offset; devlink_fmsg_binary_pair_nest_start(fmsg, name); for (offset = 0; offset < value_len; offset += data_size) { data_size = value_len - offset; if (data_size > DEVLINK_FMSG_MAX_SIZE) data_size = DEVLINK_FMSG_MAX_SIZE; devlink_fmsg_binary_put(fmsg, value + offset, data_size); } devlink_fmsg_binary_pair_nest_end(fmsg); fmsg->putting_binary = false; } EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put); static int devlink_fmsg_item_fill_type(struct devlink_fmsg_item *msg, struct sk_buff *skb) { switch (msg->nla_type) { case NLA_FLAG: case NLA_U8: case NLA_U32: case NLA_U64: case NLA_NUL_STRING: case NLA_BINARY: return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, msg->nla_type); default: return -EINVAL; } } static int devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb) { int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; u8 tmp; switch (msg->nla_type) { case NLA_FLAG: /* Always provide flag data, regardless of its value */ tmp = *(bool *)msg->value; return nla_put_u8(skb, attrtype, tmp); case NLA_U8: return nla_put_u8(skb, attrtype, *(u8 *)msg->value); case NLA_U32: return nla_put_u32(skb, attrtype, *(u32 *)msg->value); case NLA_U64: return nla_put_u64_64bit(skb, attrtype, *(u64 *)msg->value, DEVLINK_ATTR_PAD); case NLA_NUL_STRING: return nla_put_string(skb, attrtype, (char *)&msg->value); case NLA_BINARY: return nla_put(skb, attrtype, msg->len, (void *)&msg->value); default: return -EINVAL; } } static int devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb, int *start) { struct devlink_fmsg_item *item; struct nlattr *fmsg_nlattr; int err = 0; int i = 0; fmsg_nlattr = nla_nest_start_noflag(skb, DEVLINK_ATTR_FMSG); if (!fmsg_nlattr) return -EMSGSIZE; list_for_each_entry(item, &fmsg->item_list, list) { if (i < *start) { i++; continue; } switch (item->attrtype) { case DEVLINK_ATTR_FMSG_OBJ_NEST_START: case DEVLINK_ATTR_FMSG_PAIR_NEST_START: case DEVLINK_ATTR_FMSG_ARR_NEST_START: case DEVLINK_ATTR_FMSG_NEST_END: err = nla_put_flag(skb, item->attrtype); break; case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA: err = devlink_fmsg_item_fill_type(item, skb); if (err) break; err = devlink_fmsg_item_fill_data(item, skb); break; case DEVLINK_ATTR_FMSG_OBJ_NAME: err = nla_put_string(skb, item->attrtype, (char *)&item->value); break; default: err = -EINVAL; break; } if (!err) *start = ++i; else break; } nla_nest_end(skb, fmsg_nlattr); return err; } static int devlink_fmsg_snd(struct devlink_fmsg *fmsg, struct genl_info *info, enum devlink_command cmd, int flags) { struct nlmsghdr *nlh; struct sk_buff *skb; bool last = false; int index = 0; void *hdr; int err; if (fmsg->err) return fmsg->err; while (!last) { int tmp_index = index; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &devlink_nl_family, flags | NLM_F_MULTI, cmd); if (!hdr) { err = -EMSGSIZE; goto nla_put_failure; } err = devlink_fmsg_prepare_skb(fmsg, skb, &index); if (!err) last = true; else if (err != -EMSGSIZE || tmp_index == index) goto nla_put_failure; genlmsg_end(skb, hdr); err = genlmsg_reply(skb, info); if (err) return err; } skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { err = -EMSGSIZE; goto nla_put_failure; } return genlmsg_reply(skb, info); nla_put_failure: nlmsg_free(skb); return err; } static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb, struct netlink_callback *cb, enum devlink_command cmd) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); int index = state->idx; int tmp_index = index; void *hdr; int err; if (fmsg->err) return fmsg->err; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd); if (!hdr) { err = -EMSGSIZE; goto nla_put_failure; } err = devlink_fmsg_prepare_skb(fmsg, skb, &index); if ((err && err != -EMSGSIZE) || tmp_index == index) goto nla_put_failure; state->idx = index; genlmsg_end(skb, hdr); return skb->len; nla_put_failure: genlmsg_cancel(skb, hdr); return err; } int devlink_nl_health_reporter_diagnose_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; struct devlink_fmsg *fmsg; int err; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; if (!reporter->ops->diagnose) return -EOPNOTSUPP; fmsg = devlink_fmsg_alloc(); if (!fmsg) return -ENOMEM; devlink_fmsg_obj_nest_start(fmsg); err = reporter->ops->diagnose(reporter, fmsg, info->extack); if (err) goto out; devlink_fmsg_obj_nest_end(fmsg); err = devlink_fmsg_snd(fmsg, info, DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0); out: devlink_fmsg_free(fmsg); return err; } static struct devlink_health_reporter * devlink_health_reporter_get_from_cb_lock(struct netlink_callback *cb) { const struct genl_info *info = genl_info_dump(cb); struct devlink_health_reporter *reporter; struct nlattr **attrs = info->attrs; struct devlink *devlink; devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs, false); if (IS_ERR(devlink)) return NULL; reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); if (!reporter) { devl_unlock(devlink); devlink_put(devlink); } return reporter; } int devlink_nl_health_reporter_dump_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_health_reporter *reporter; struct devlink *devlink; int err; reporter = devlink_health_reporter_get_from_cb_lock(cb); if (!reporter) return -EINVAL; devlink = reporter->devlink; if (!reporter->ops->dump) { devl_unlock(devlink); devlink_put(devlink); return -EOPNOTSUPP; } if (!state->idx) { err = devlink_health_do_dump(reporter, NULL, cb->extack); if (err) goto unlock; state->dump_ts = reporter->dump_ts; } if (!reporter->dump_fmsg || state->dump_ts != reporter->dump_ts) { NL_SET_ERR_MSG(cb->extack, "Dump trampled, please retry"); err = -EAGAIN; goto unlock; } err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb, DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET); unlock: devl_unlock(devlink); devlink_put(devlink); return err; } int devlink_nl_health_reporter_dump_clear_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; if (!reporter->ops->dump) return -EOPNOTSUPP; devlink_health_dump_clear(reporter); return 0; } int devlink_nl_health_reporter_test_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; if (!reporter->ops->test) return -EOPNOTSUPP; return reporter->ops->test(reporter, info->extack); } |
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 1 1 1 3 2 2 2 1 1 3 3 3 3 3 3 3 4 1 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 | // SPDX-License-Identifier: GPL-2.0-only /* * Driver for the VoIP USB phones with CM109 chipsets. * * Copyright (C) 2007 - 2008 Alfred E. Heggestad <aeh@db.org> */ /* * Tested devices: * - Komunikate KIP1000 * - Genius G-talk * - Allied-Telesis Corega USBPH01 * - ... * * This driver is based on the yealink.c driver * * Thanks to: * - Authors of yealink.c * - Thomas Reitmayr * - Oliver Neukum for good review comments and code * - Shaun Jackman <sjackman@gmail.com> for Genius G-talk keymap * - Dmitry Torokhov for valuable input and review * * Todo: * - Read/write EEPROM */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/rwsem.h> #include <linux/usb/input.h> #define DRIVER_VERSION "20080805" #define DRIVER_AUTHOR "Alfred E. Heggestad" #define DRIVER_DESC "CM109 phone driver" static char *phone = "kip1000"; module_param(phone, charp, S_IRUSR); MODULE_PARM_DESC(phone, "Phone name {kip1000, gtalk, usbph01, atcom}"); enum { /* HID Registers */ HID_IR0 = 0x00, /* Record/Playback-mute button, Volume up/down */ HID_IR1 = 0x01, /* GPI, generic registers or EEPROM_DATA0 */ HID_IR2 = 0x02, /* Generic registers or EEPROM_DATA1 */ HID_IR3 = 0x03, /* Generic registers or EEPROM_CTRL */ HID_OR0 = 0x00, /* Mapping control, buzzer, SPDIF (offset 0x04) */ HID_OR1 = 0x01, /* GPO - General Purpose Output */ HID_OR2 = 0x02, /* Set GPIO to input/output mode */ HID_OR3 = 0x03, /* SPDIF status channel or EEPROM_CTRL */ /* HID_IR0 */ RECORD_MUTE = 1 << 3, PLAYBACK_MUTE = 1 << 2, VOLUME_DOWN = 1 << 1, VOLUME_UP = 1 << 0, /* HID_OR0 */ /* bits 7-6 0: HID_OR1-2 are used for GPO; HID_OR0, 3 are used for buzzer and SPDIF 1: HID_OR0-3 are used as generic HID registers 2: Values written to HID_OR0-3 are also mapped to MCU_CTRL, EEPROM_DATA0-1, EEPROM_CTRL (see Note) 3: Reserved */ HID_OR_GPO_BUZ_SPDIF = 0 << 6, HID_OR_GENERIC_HID_REG = 1 << 6, HID_OR_MAP_MCU_EEPROM = 2 << 6, BUZZER_ON = 1 << 5, /* up to 256 normal keys, up to 15 special key combinations */ KEYMAP_SIZE = 256 + 15, }; /* CM109 protocol packet */ struct cm109_ctl_packet { u8 byte[4]; } __attribute__ ((packed)); enum { USB_PKT_LEN = sizeof(struct cm109_ctl_packet) }; /* CM109 device structure */ struct cm109_dev { struct input_dev *idev; /* input device */ struct usb_device *udev; /* usb device */ struct usb_interface *intf; /* irq input channel */ struct cm109_ctl_packet *irq_data; dma_addr_t irq_dma; struct urb *urb_irq; /* control output channel */ struct cm109_ctl_packet *ctl_data; dma_addr_t ctl_dma; struct usb_ctrlrequest *ctl_req; struct urb *urb_ctl; /* * The 3 bitfields below are protected by ctl_submit_lock. * They have to be separate since they are accessed from IRQ * context. */ unsigned irq_urb_pending:1; /* irq_urb is in flight */ unsigned ctl_urb_pending:1; /* ctl_urb is in flight */ unsigned buzzer_pending:1; /* need to issue buzz command */ spinlock_t ctl_submit_lock; unsigned char buzzer_state; /* on/off */ /* flags */ unsigned open:1; unsigned resetting:1; unsigned shutdown:1; /* This mutex protects writes to the above flags */ struct mutex pm_mutex; unsigned short keymap[KEYMAP_SIZE]; char phys[64]; /* physical device path */ int key_code; /* last reported key */ int keybit; /* 0=new scan 1,2,4,8=scan columns */ u8 gpi; /* Cached value of GPI (high nibble) */ }; /****************************************************************************** * CM109 key interface *****************************************************************************/ static unsigned short special_keymap(int code) { if (code > 0xff) { switch (code - 0xff) { case RECORD_MUTE: return KEY_MICMUTE; case PLAYBACK_MUTE: return KEY_MUTE; case VOLUME_DOWN: return KEY_VOLUMEDOWN; case VOLUME_UP: return KEY_VOLUMEUP; } } return KEY_RESERVED; } /* Map device buttons to internal key events. * * The "up" and "down" keys, are symbolised by arrows on the button. * The "pickup" and "hangup" keys are symbolised by a green and red phone * on the button. Komunikate KIP1000 Keyboard Matrix -> -- 1 -- 2 -- 3 --> GPI pin 4 (0x10) | | | | <- -- 4 -- 5 -- 6 --> GPI pin 5 (0x20) | | | | END - 7 -- 8 -- 9 --> GPI pin 6 (0x40) | | | | OK -- * -- 0 -- # --> GPI pin 7 (0x80) | | | | /|\ /|\ /|\ /|\ | | | | GPO pin: 3 2 1 0 0x8 0x4 0x2 0x1 */ static unsigned short keymap_kip1000(int scancode) { switch (scancode) { /* phone key: */ case 0x82: return KEY_NUMERIC_0; /* 0 */ case 0x14: return KEY_NUMERIC_1; /* 1 */ case 0x12: return KEY_NUMERIC_2; /* 2 */ case 0x11: return KEY_NUMERIC_3; /* 3 */ case 0x24: return KEY_NUMERIC_4; /* 4 */ case 0x22: return KEY_NUMERIC_5; /* 5 */ case 0x21: return KEY_NUMERIC_6; /* 6 */ case 0x44: return KEY_NUMERIC_7; /* 7 */ case 0x42: return KEY_NUMERIC_8; /* 8 */ case 0x41: return KEY_NUMERIC_9; /* 9 */ case 0x81: return KEY_NUMERIC_POUND; /* # */ case 0x84: return KEY_NUMERIC_STAR; /* * */ case 0x88: return KEY_ENTER; /* pickup */ case 0x48: return KEY_ESC; /* hangup */ case 0x28: return KEY_LEFT; /* IN */ case 0x18: return KEY_RIGHT; /* OUT */ default: return special_keymap(scancode); } } /* Contributed by Shaun Jackman <sjackman@gmail.com> Genius G-Talk keyboard matrix 0 1 2 3 4: 0 4 8 Talk 5: 1 5 9 End 6: 2 6 # Up 7: 3 7 * Down */ static unsigned short keymap_gtalk(int scancode) { switch (scancode) { case 0x11: return KEY_NUMERIC_0; case 0x21: return KEY_NUMERIC_1; case 0x41: return KEY_NUMERIC_2; case 0x81: return KEY_NUMERIC_3; case 0x12: return KEY_NUMERIC_4; case 0x22: return KEY_NUMERIC_5; case 0x42: return KEY_NUMERIC_6; case 0x82: return KEY_NUMERIC_7; case 0x14: return KEY_NUMERIC_8; case 0x24: return KEY_NUMERIC_9; case 0x44: return KEY_NUMERIC_POUND; /* # */ case 0x84: return KEY_NUMERIC_STAR; /* * */ case 0x18: return KEY_ENTER; /* Talk (green handset) */ case 0x28: return KEY_ESC; /* End (red handset) */ case 0x48: return KEY_UP; /* Menu up (rocker switch) */ case 0x88: return KEY_DOWN; /* Menu down (rocker switch) */ default: return special_keymap(scancode); } } /* * Keymap for Allied-Telesis Corega USBPH01 * http://www.alliedtelesis-corega.com/2/1344/1437/1360/chprd.html * * Contributed by july@nat.bg */ static unsigned short keymap_usbph01(int scancode) { switch (scancode) { case 0x11: return KEY_NUMERIC_0; /* 0 */ case 0x21: return KEY_NUMERIC_1; /* 1 */ case 0x41: return KEY_NUMERIC_2; /* 2 */ case 0x81: return KEY_NUMERIC_3; /* 3 */ case 0x12: return KEY_NUMERIC_4; /* 4 */ case 0x22: return KEY_NUMERIC_5; /* 5 */ case 0x42: return KEY_NUMERIC_6; /* 6 */ case 0x82: return KEY_NUMERIC_7; /* 7 */ case 0x14: return KEY_NUMERIC_8; /* 8 */ case 0x24: return KEY_NUMERIC_9; /* 9 */ case 0x44: return KEY_NUMERIC_POUND; /* # */ case 0x84: return KEY_NUMERIC_STAR; /* * */ case 0x18: return KEY_ENTER; /* pickup */ case 0x28: return KEY_ESC; /* hangup */ case 0x48: return KEY_LEFT; /* IN */ case 0x88: return KEY_RIGHT; /* OUT */ default: return special_keymap(scancode); } } /* * Keymap for ATCom AU-100 * http://www.atcom.cn/products.html * http://www.packetizer.com/products/au100/ * http://www.voip-info.org/wiki/view/AU-100 * * Contributed by daniel@gimpelevich.san-francisco.ca.us */ static unsigned short keymap_atcom(int scancode) { switch (scancode) { /* phone key: */ case 0x82: return KEY_NUMERIC_0; /* 0 */ case 0x11: return KEY_NUMERIC_1; /* 1 */ case 0x12: return KEY_NUMERIC_2; /* 2 */ case 0x14: return KEY_NUMERIC_3; /* 3 */ case 0x21: return KEY_NUMERIC_4; /* 4 */ case 0x22: return KEY_NUMERIC_5; /* 5 */ case 0x24: return KEY_NUMERIC_6; /* 6 */ case 0x41: return KEY_NUMERIC_7; /* 7 */ case 0x42: return KEY_NUMERIC_8; /* 8 */ case 0x44: return KEY_NUMERIC_9; /* 9 */ case 0x84: return KEY_NUMERIC_POUND; /* # */ case 0x81: return KEY_NUMERIC_STAR; /* * */ case 0x18: return KEY_ENTER; /* pickup */ case 0x28: return KEY_ESC; /* hangup */ case 0x48: return KEY_LEFT; /* left arrow */ case 0x88: return KEY_RIGHT; /* right arrow */ default: return special_keymap(scancode); } } static unsigned short (*keymap)(int) = keymap_kip1000; /* * Completes a request by converting the data into events for the * input subsystem. */ static void report_key(struct cm109_dev *dev, int key) { struct input_dev *idev = dev->idev; if (dev->key_code >= 0) { /* old key up */ input_report_key(idev, dev->key_code, 0); } dev->key_code = key; if (key >= 0) { /* new valid key */ input_report_key(idev, key, 1); } input_sync(idev); } /* * Converts data of special key presses (volume, mute) into events * for the input subsystem, sends press-n-release for mute keys. */ static void cm109_report_special(struct cm109_dev *dev) { static const u8 autorelease = RECORD_MUTE | PLAYBACK_MUTE; struct input_dev *idev = dev->idev; u8 data = dev->irq_data->byte[HID_IR0]; unsigned short keycode; int i; for (i = 0; i < 4; i++) { keycode = dev->keymap[0xff + BIT(i)]; if (keycode == KEY_RESERVED) continue; input_report_key(idev, keycode, data & BIT(i)); if (data & autorelease & BIT(i)) { input_sync(idev); input_report_key(idev, keycode, 0); } } input_sync(idev); } /****************************************************************************** * CM109 usb communication interface *****************************************************************************/ static void cm109_submit_buzz_toggle(struct cm109_dev *dev) { int error; if (dev->buzzer_state) dev->ctl_data->byte[HID_OR0] |= BUZZER_ON; else dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON; error = usb_submit_urb(dev->urb_ctl, GFP_ATOMIC); if (error) dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_ctl) failed %d\n", __func__, error); } /* * IRQ handler */ static void cm109_urb_irq_callback(struct urb *urb) { struct cm109_dev *dev = urb->context; const int status = urb->status; int error; unsigned long flags; dev_dbg(&dev->intf->dev, "### URB IRQ: [0x%02x 0x%02x 0x%02x 0x%02x] keybit=0x%02x\n", dev->irq_data->byte[0], dev->irq_data->byte[1], dev->irq_data->byte[2], dev->irq_data->byte[3], dev->keybit); if (status) { if (status == -ESHUTDOWN) return; dev_err_ratelimited(&dev->intf->dev, "%s: urb status %d\n", __func__, status); goto out; } /* Special keys */ cm109_report_special(dev); /* Scan key column */ if (dev->keybit == 0xf) { /* Any changes ? */ if ((dev->gpi & 0xf0) == (dev->irq_data->byte[HID_IR1] & 0xf0)) goto out; dev->gpi = dev->irq_data->byte[HID_IR1] & 0xf0; dev->keybit = 0x1; } else { report_key(dev, dev->keymap[dev->irq_data->byte[HID_IR1]]); dev->keybit <<= 1; if (dev->keybit > 0x8) dev->keybit = 0xf; } out: spin_lock_irqsave(&dev->ctl_submit_lock, flags); dev->irq_urb_pending = 0; if (likely(!dev->shutdown)) { if (dev->buzzer_state) dev->ctl_data->byte[HID_OR0] |= BUZZER_ON; else dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON; dev->ctl_data->byte[HID_OR1] = dev->keybit; dev->ctl_data->byte[HID_OR2] = dev->keybit; dev->buzzer_pending = 0; dev->ctl_urb_pending = 1; error = usb_submit_urb(dev->urb_ctl, GFP_ATOMIC); if (error) dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_ctl) failed %d\n", __func__, error); } spin_unlock_irqrestore(&dev->ctl_submit_lock, flags); } static void cm109_urb_ctl_callback(struct urb *urb) { struct cm109_dev *dev = urb->context; const int status = urb->status; int error; unsigned long flags; dev_dbg(&dev->intf->dev, "### URB CTL: [0x%02x 0x%02x 0x%02x 0x%02x]\n", dev->ctl_data->byte[0], dev->ctl_data->byte[1], dev->ctl_data->byte[2], dev->ctl_data->byte[3]); if (status) { if (status == -ESHUTDOWN) return; dev_err_ratelimited(&dev->intf->dev, "%s: urb status %d\n", __func__, status); } spin_lock_irqsave(&dev->ctl_submit_lock, flags); dev->ctl_urb_pending = 0; if (likely(!dev->shutdown)) { if (dev->buzzer_pending || status) { dev->buzzer_pending = 0; dev->ctl_urb_pending = 1; cm109_submit_buzz_toggle(dev); } else if (likely(!dev->irq_urb_pending)) { /* ask for key data */ dev->irq_urb_pending = 1; error = usb_submit_urb(dev->urb_irq, GFP_ATOMIC); if (error) dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_irq) failed %d\n", __func__, error); } } spin_unlock_irqrestore(&dev->ctl_submit_lock, flags); } static void cm109_toggle_buzzer_async(struct cm109_dev *dev) { unsigned long flags; spin_lock_irqsave(&dev->ctl_submit_lock, flags); if (dev->ctl_urb_pending) { /* URB completion will resubmit */ dev->buzzer_pending = 1; } else { dev->ctl_urb_pending = 1; cm109_submit_buzz_toggle(dev); } spin_unlock_irqrestore(&dev->ctl_submit_lock, flags); } static void cm109_toggle_buzzer_sync(struct cm109_dev *dev, int on) { int error; if (on) dev->ctl_data->byte[HID_OR0] |= BUZZER_ON; else dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON; error = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), dev->ctl_req->bRequest, dev->ctl_req->bRequestType, le16_to_cpu(dev->ctl_req->wValue), le16_to_cpu(dev->ctl_req->wIndex), dev->ctl_data, USB_PKT_LEN, USB_CTRL_SET_TIMEOUT); if (error < 0 && error != -EINTR) dev_err(&dev->intf->dev, "%s: usb_control_msg() failed %d\n", __func__, error); } static void cm109_stop_traffic(struct cm109_dev *dev) { dev->shutdown = 1; /* * Make sure other CPUs see this */ smp_wmb(); usb_kill_urb(dev->urb_ctl); usb_kill_urb(dev->urb_irq); cm109_toggle_buzzer_sync(dev, 0); dev->shutdown = 0; smp_wmb(); } static void cm109_restore_state(struct cm109_dev *dev) { if (dev->open) { /* * Restore buzzer state. * This will also kick regular URB submission */ cm109_toggle_buzzer_async(dev); } } /****************************************************************************** * input event interface *****************************************************************************/ static int cm109_input_open(struct input_dev *idev) { struct cm109_dev *dev = input_get_drvdata(idev); int error; error = usb_autopm_get_interface(dev->intf); if (error < 0) { dev_err(&idev->dev, "%s - cannot autoresume, result %d\n", __func__, error); return error; } mutex_lock(&dev->pm_mutex); dev->buzzer_state = 0; dev->key_code = -1; /* no keys pressed */ dev->keybit = 0xf; /* issue INIT */ dev->ctl_data->byte[HID_OR0] = HID_OR_GPO_BUZ_SPDIF; dev->ctl_data->byte[HID_OR1] = dev->keybit; dev->ctl_data->byte[HID_OR2] = dev->keybit; dev->ctl_data->byte[HID_OR3] = 0x00; dev->ctl_urb_pending = 1; error = usb_submit_urb(dev->urb_ctl, GFP_KERNEL); if (error) { dev->ctl_urb_pending = 0; dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_ctl) failed %d\n", __func__, error); } else { dev->open = 1; } mutex_unlock(&dev->pm_mutex); if (error) usb_autopm_put_interface(dev->intf); return error; } static void cm109_input_close(struct input_dev *idev) { struct cm109_dev *dev = input_get_drvdata(idev); mutex_lock(&dev->pm_mutex); /* * Once we are here event delivery is stopped so we * don't need to worry about someone starting buzzer * again */ cm109_stop_traffic(dev); dev->open = 0; mutex_unlock(&dev->pm_mutex); usb_autopm_put_interface(dev->intf); } static int cm109_input_ev(struct input_dev *idev, unsigned int type, unsigned int code, int value) { struct cm109_dev *dev = input_get_drvdata(idev); dev_dbg(&dev->intf->dev, "input_ev: type=%u code=%u value=%d\n", type, code, value); if (type != EV_SND) return -EINVAL; switch (code) { case SND_TONE: case SND_BELL: dev->buzzer_state = !!value; if (!dev->resetting) cm109_toggle_buzzer_async(dev); return 0; default: return -EINVAL; } } /****************************************************************************** * Linux interface and usb initialisation *****************************************************************************/ struct driver_info { char *name; }; static const struct driver_info info_cm109 = { .name = "CM109 USB driver", }; enum { VENDOR_ID = 0x0d8c, /* C-Media Electronics */ PRODUCT_ID_CM109 = 0x000e, /* CM109 defines range 0x0008 - 0x000f */ }; /* table of devices that work with this driver */ static const struct usb_device_id cm109_usb_table[] = { { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = VENDOR_ID, .idProduct = PRODUCT_ID_CM109, .bInterfaceClass = USB_CLASS_HID, .bInterfaceSubClass = 0, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t) &info_cm109 }, /* you can add more devices here with product ID 0x0008 - 0x000f */ { } }; static void cm109_usb_cleanup(struct cm109_dev *dev) { kfree(dev->ctl_req); usb_free_coherent(dev->udev, USB_PKT_LEN, dev->ctl_data, dev->ctl_dma); usb_free_coherent(dev->udev, USB_PKT_LEN, dev->irq_data, dev->irq_dma); usb_free_urb(dev->urb_irq); /* parameter validation in core/urb */ usb_free_urb(dev->urb_ctl); /* parameter validation in core/urb */ kfree(dev); } static void cm109_usb_disconnect(struct usb_interface *interface) { struct cm109_dev *dev = usb_get_intfdata(interface); usb_set_intfdata(interface, NULL); input_unregister_device(dev->idev); cm109_usb_cleanup(dev); } static int cm109_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct driver_info *nfo = (struct driver_info *)id->driver_info; struct usb_host_interface *interface; struct usb_endpoint_descriptor *endpoint; struct cm109_dev *dev; struct input_dev *input_dev = NULL; int ret, pipe, i; int error = -ENOMEM; interface = intf->cur_altsetting; if (interface->desc.bNumEndpoints < 1) return -ENODEV; endpoint = &interface->endpoint[0].desc; if (!usb_endpoint_is_int_in(endpoint)) return -ENODEV; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; spin_lock_init(&dev->ctl_submit_lock); mutex_init(&dev->pm_mutex); dev->udev = udev; dev->intf = intf; dev->idev = input_dev = input_allocate_device(); if (!input_dev) goto err_out; /* allocate usb buffers */ dev->irq_data = usb_alloc_coherent(udev, USB_PKT_LEN, GFP_KERNEL, &dev->irq_dma); if (!dev->irq_data) goto err_out; dev->ctl_data = usb_alloc_coherent(udev, USB_PKT_LEN, GFP_KERNEL, &dev->ctl_dma); if (!dev->ctl_data) goto err_out; dev->ctl_req = kmalloc(sizeof(*(dev->ctl_req)), GFP_KERNEL); if (!dev->ctl_req) goto err_out; /* allocate urb structures */ dev->urb_irq = usb_alloc_urb(0, GFP_KERNEL); if (!dev->urb_irq) goto err_out; dev->urb_ctl = usb_alloc_urb(0, GFP_KERNEL); if (!dev->urb_ctl) goto err_out; /* get a handle to the interrupt data pipe */ pipe = usb_rcvintpipe(udev, endpoint->bEndpointAddress); ret = usb_maxpacket(udev, pipe); if (ret != USB_PKT_LEN) dev_err(&intf->dev, "invalid payload size %d, expected %d\n", ret, USB_PKT_LEN); /* initialise irq urb */ usb_fill_int_urb(dev->urb_irq, udev, pipe, dev->irq_data, USB_PKT_LEN, cm109_urb_irq_callback, dev, endpoint->bInterval); dev->urb_irq->transfer_dma = dev->irq_dma; dev->urb_irq->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; dev->urb_irq->dev = udev; /* initialise ctl urb */ dev->ctl_req->bRequestType = USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_OUT; dev->ctl_req->bRequest = USB_REQ_SET_CONFIGURATION; dev->ctl_req->wValue = cpu_to_le16(0x200); dev->ctl_req->wIndex = cpu_to_le16(interface->desc.bInterfaceNumber); dev->ctl_req->wLength = cpu_to_le16(USB_PKT_LEN); usb_fill_control_urb(dev->urb_ctl, udev, usb_sndctrlpipe(udev, 0), (void *)dev->ctl_req, dev->ctl_data, USB_PKT_LEN, cm109_urb_ctl_callback, dev); dev->urb_ctl->transfer_dma = dev->ctl_dma; dev->urb_ctl->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; dev->urb_ctl->dev = udev; /* find out the physical bus location */ usb_make_path(udev, dev->phys, sizeof(dev->phys)); strlcat(dev->phys, "/input0", sizeof(dev->phys)); /* register settings for the input device */ input_dev->name = nfo->name; input_dev->phys = dev->phys; usb_to_input_id(udev, &input_dev->id); input_dev->dev.parent = &intf->dev; input_set_drvdata(input_dev, dev); input_dev->open = cm109_input_open; input_dev->close = cm109_input_close; input_dev->event = cm109_input_ev; input_dev->keycode = dev->keymap; input_dev->keycodesize = sizeof(unsigned char); input_dev->keycodemax = ARRAY_SIZE(dev->keymap); input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_SND); input_dev->sndbit[0] = BIT_MASK(SND_BELL) | BIT_MASK(SND_TONE); /* register available key events */ for (i = 0; i < KEYMAP_SIZE; i++) { unsigned short k = keymap(i); dev->keymap[i] = k; __set_bit(k, input_dev->keybit); } __clear_bit(KEY_RESERVED, input_dev->keybit); error = input_register_device(dev->idev); if (error) goto err_out; usb_set_intfdata(intf, dev); return 0; err_out: input_free_device(input_dev); cm109_usb_cleanup(dev); return error; } static int cm109_usb_suspend(struct usb_interface *intf, pm_message_t message) { struct cm109_dev *dev = usb_get_intfdata(intf); dev_info(&intf->dev, "cm109: usb_suspend (event=%d)\n", message.event); mutex_lock(&dev->pm_mutex); cm109_stop_traffic(dev); mutex_unlock(&dev->pm_mutex); return 0; } static int cm109_usb_resume(struct usb_interface *intf) { struct cm109_dev *dev = usb_get_intfdata(intf); dev_info(&intf->dev, "cm109: usb_resume\n"); mutex_lock(&dev->pm_mutex); cm109_restore_state(dev); mutex_unlock(&dev->pm_mutex); return 0; } static int cm109_usb_pre_reset(struct usb_interface *intf) { struct cm109_dev *dev = usb_get_intfdata(intf); mutex_lock(&dev->pm_mutex); /* * Make sure input events don't try to toggle buzzer * while we are resetting */ dev->resetting = 1; smp_wmb(); cm109_stop_traffic(dev); return 0; } static int cm109_usb_post_reset(struct usb_interface *intf) { struct cm109_dev *dev = usb_get_intfdata(intf); dev->resetting = 0; smp_wmb(); cm109_restore_state(dev); mutex_unlock(&dev->pm_mutex); return 0; } static struct usb_driver cm109_driver = { .name = "cm109", .probe = cm109_usb_probe, .disconnect = cm109_usb_disconnect, .suspend = cm109_usb_suspend, .resume = cm109_usb_resume, .reset_resume = cm109_usb_resume, .pre_reset = cm109_usb_pre_reset, .post_reset = cm109_usb_post_reset, .id_table = cm109_usb_table, .supports_autosuspend = 1, }; static int __init cm109_select_keymap(void) { /* Load the phone keymap */ if (!strcasecmp(phone, "kip1000")) { keymap = keymap_kip1000; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for Komunikate KIP1000 phone loaded\n"); } else if (!strcasecmp(phone, "gtalk")) { keymap = keymap_gtalk; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for Genius G-talk phone loaded\n"); } else if (!strcasecmp(phone, "usbph01")) { keymap = keymap_usbph01; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for Allied-Telesis Corega USBPH01 phone loaded\n"); } else if (!strcasecmp(phone, "atcom")) { keymap = keymap_atcom; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for ATCom AU-100 phone loaded\n"); } else { printk(KERN_ERR KBUILD_MODNAME ": " "Unsupported phone: %s\n", phone); return -EINVAL; } return 0; } static int __init cm109_init(void) { int err; err = cm109_select_keymap(); if (err) return err; err = usb_register(&cm109_driver); if (err) return err; printk(KERN_INFO KBUILD_MODNAME ": " DRIVER_DESC ": " DRIVER_VERSION " (C) " DRIVER_AUTHOR "\n"); return 0; } static void __exit cm109_exit(void) { usb_deregister(&cm109_driver); } module_init(cm109_init); module_exit(cm109_exit); MODULE_DEVICE_TABLE(usb, cm109_usb_table); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); |
193 99 56 92 97 117 1 93 92 3 3 3 3 60 74 82 36 144 2 1 145 38 5 138 95 38 2 95 94 145 1 142 27 3 3 93 4 89 4 140 4 1 1 4 4 218 10 216 21 132 227 227 216 9 143 132 57 88 124 226 5 181 93 226 24 219 3 26 3 25 227 227 83 10 39 20 19 93 88 93 89 1 92 88 82 26 93 75 18 93 38 1 38 1 38 1 39 39 39 39 162 156 7 36 126 133 25 5 160 160 11 1 10 2 1026 995 66 11 66 25 66 18 70 23 61 159 159 159 159 6 6 5 170 165 18 7 173 188 123 175 172 189 93 142 20 187 4 185 187 181 18 7 145 144 93 180 8 188 186 93 145 169 88 189 189 187 189 184 18 132 173 16 16 31 82 3 3 3 3 3 3 3 3 22 2 5 5 5 5 5 5 5 5 5 3 5 4 4 4 4 1 4 4 4 4 1 4 66 104 108 65 117 4 121 121 5 117 114 4 121 1 3 99 9 99 25 81 88 4 29 68 1 16 4 51 81 2 41 5 100 22 14 12 19 6 106 70 35 37 35 25 93 23 22 23 21 23 23 2 18 12 4 2 3 1 104 93 6 3 4 1 14 93 92 1 1 55 215 78 48 96 217 193 170 187 22 106 16 104 105 85 100 6 98 100 22 19 83 20 100 88 8 22 1 23 89 23 93 91 99 31 93 34 99 95 8 1 98 44 139 81 92 51 57 30 85 89 14 15 15 1 15 8 1 9 88 99 10 15 16 147 119 147 4 148 139 19 19 138 4 2 2 2 8 8 8 8 8 8 13 13 14 14 1 1 14 14 3 12 14 2 13 11 7 14 14 14 14 14 1 2 6 5 3 5 5 7 7 6 7 6 5 3 3 3 6 6 1 6 3 6 7 6 161 34 156 156 18 18 2 16 18 4 4 4 4 30 6 23 23 30 39 39 39 38 1 1 37 38 1 39 38 1 39 39 104 104 1 105 105 104 2 102 105 99 1 3 105 105 104 92 13 105 92 38 25 12 12 1 1 10 10 2 9 10 1 10 10 4 31 27 5 104 105 105 13 92 93 38 55 92 4 11 48 37 37 11 11 37 37 4 10 119 2 116 80 7 7 7 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 | // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ /* * Changes: Pedro Roque : Retransmit queue handled by TCP. * : Fragmentation on mtu decrease * : Segment collapse on retransmit * : AF independence * * Linus Torvalds : send_delayed_ack * David S. Miller : Charge memory using the right skb * during syn/ack processing. * David S. Miller : Output engine completely rewritten. * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. * Cacophonix Gaul : draft-minshall-nagle-01 * J Hadi Salim : ECN support * */ #define pr_fmt(fmt) "TCP: " fmt #include <net/tcp.h> #include <net/mptcp.h> #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/module.h> #include <linux/static_key.h> #include <trace/events/tcp.h> /* Refresh clocks of a TCP socket, * ensuring monotically increasing values. */ void tcp_mstamp_refresh(struct tcp_sock *tp) { u64 val = tcp_clock_ns(); tp->tcp_clock_cache = val; tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC); } static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq); __skb_unlink(skb, &sk->sk_write_queue); tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); if (tp->highest_sack == NULL) tp->highest_sack = skb; tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, tcp_skb_pcount(skb)); tcp_check_space(sk); } /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one * window scaling factor due to loss of precision. * If window has been shrunk, what should we make? It is not clear at all. * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( * Anything in between SND.UNA...SND.UNA+SND.WND also can be already * invalid. OK, let's make this for now: */ static inline __u32 tcp_acceptable_seq(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (!before(tcp_wnd_end(tp), tp->snd_nxt) || (tp->rx_opt.wscale_ok && ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale)))) return tp->snd_nxt; else return tcp_wnd_end(tp); } /* Calculate mss to advertise in SYN segment. * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: * * 1. It is independent of path mtu. * 2. Ideally, it is maximal possible segment size i.e. 65535-40. * 3. For IPv4 it is reasonable to calculate it from maximal MTU of * attached devices, because some buggy hosts are confused by * large MSS. * 4. We do not make 3, we advertise MSS, calculated from first * hop device mtu, but allow to raise it to ip_rt_min_advmss. * This may be overridden via information stored in routing table. * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, * probably even Jumbo". */ static __u16 tcp_advertise_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); const struct dst_entry *dst = __sk_dst_get(sk); int mss = tp->advmss; if (dst) { unsigned int metric = dst_metric_advmss(dst); if (metric < mss) { mss = metric; tp->advmss = mss; } } return (__u16)mss; } /* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. */ void tcp_cwnd_restart(struct sock *sk, s32 delta) { struct tcp_sock *tp = tcp_sk(sk); u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 cwnd = tcp_snd_cwnd(tp); tcp_ca_event(sk, CA_EVENT_CWND_RESTART); tp->snd_ssthresh = tcp_current_ssthresh(sk); restart_cwnd = min(restart_cwnd, cwnd); while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd)); tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_cwnd_used = 0; } /* Congestion state accounting after a packet has been sent. */ static void tcp_event_data_sent(struct tcp_sock *tp, struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_jiffies32; if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); tp->lsndtime = now; /* If it is a reply for ato after last received * packet, increase pingpong count. */ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) inet_csk_inc_pingpong_cnt(sk); } /* Account for an ACK we sent. */ static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) { struct tcp_sock *tp = tcp_sk(sk); if (unlikely(tp->compressed_ack)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, tp->compressed_ack); tp->compressed_ack = 0; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } if (unlikely(rcv_nxt != tp->rcv_nxt)) return; /* Special ACK sent by DCTCP to reflect ECN */ tcp_dec_quickack_mode(sk); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } /* Determine a window scaling and initial window to offer. * Based on the assumption that the given amount of space * will be offered. Store the results in the tp structure. * NOTE: for smooth operation initial space offering should * be a multiple of mss if possible. We assume here that mss >= 1. * This MUST be enforced by all callers. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 0 : __space); /* If no clamp set the clamp to the max possible scaled window */ if (*window_clamp == 0) (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE); space = min(*window_clamp, space); /* Quantize space offering to a multiple of mss if possible. */ if (space > mss) space = rounddown(space, mss); /* NOTE: offering an initial window larger than 32767 * will break some buggy TCP stacks. If the admin tells us * it is likely we could be speaking with such a buggy stack * we will truncate our initial window offering to 32K-1 * unless the remote has sent us a window scaling option, * which we interpret as a sign the remote TCP is not * misinterpreting the window field as a signed quantity. */ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else (*rcv_wnd) = min_t(u32, space, U16_MAX); if (init_rcv_wnd) *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); *rcv_wscale = 0; if (wscale_ok) { /* Set window scaling on max possible window */ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); space = max_t(u32, space, READ_ONCE(sysctl_rmem_max)); space = min_t(u32, space, *window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE); } /* Set the clamp no higher than max representable value */ (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); } EXPORT_SYMBOL(tcp_select_initial_window); /* Chose a new window to advertise, update state in tcp_sock for the * socket, and return result with RFC1323 scaling applied. The return * value can be stuffed directly into th->window for an outgoing * frame. */ static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); u32 old_win = tp->rcv_wnd; u32 cur_win, new_win; /* Make the window 0 if we failed to queue the data because we * are out of memory. The window is temporary, so we don't store * it on the socket. */ if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) return 0; cur_win = tcp_receive_window(tp); new_win = __tcp_select_window(sk); if (new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else * we will not be able to advertise a zero * window in time. --DaveM * * Relax Will Robinson. */ if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) { /* Never shrink the offered window */ if (new_win == 0) NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV); new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } } tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; /* Make sure we do not exceed the maximum possible * scaled window. */ if (!tp->rx_opt.rcv_wscale && READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; /* If we advertise zero window, disable fast path. */ if (new_win == 0) { tp->pred_flags = 0; if (old_win) NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV); } else if (old_win == 0) { NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV); } return new_win; } /* Packet ECN state for a SYN-ACK */ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (!(tp->ecn_flags & TCP_ECN_OK)) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; else if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk)) INET_ECN_xmit(sk); } /* Packet ECN state for a SYN. */ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || tcp_ca_needs_ecn(sk) || bpf_needs_ecn; if (!use_ecn) { const struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) use_ecn = true; } tp->ecn_flags = 0; if (use_ecn) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) INET_ECN_xmit(sk); } } static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) { if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) /* tp->ecn_flags are cleared at a later point in time when * SYN ACK is ultimatively being received. */ TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); } static void tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { if (inet_rsk(req)->ecn_ok) th->ece = 1; } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, int tcp_header_len) { struct tcp_sock *tp = tcp_sk(sk); if (tp->ecn_flags & TCP_ECN_OK) { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { INET_ECN_xmit(sk); if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } } else if (!tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) th->ece = 1; } } /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { skb->ip_summed = CHECKSUM_PARTIAL; TCP_SKB_CB(skb)->tcp_flags = flags; tcp_skb_pcount_set(skb, 1); TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) seq++; TCP_SKB_CB(skb)->end_seq = seq; } static inline bool tcp_urg_mode(const struct tcp_sock *tp) { return tp->snd_una != tp->snd_up; } #define OPTION_SACK_ADVERTISE BIT(0) #define OPTION_TS BIT(1) #define OPTION_MD5 BIT(2) #define OPTION_WSCALE BIT(3) #define OPTION_FAST_OPEN_COOKIE BIT(8) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) static void smc_options_write(__be32 *ptr, u16 *options) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (unlikely(OPTION_SMC & *options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_EXP << 8) | (TCPOLEN_EXP_SMC_BASE)); *ptr++ = htonl(TCPOPT_SMC_MAGIC); } } #endif } struct tcp_out_options { u16 options; /* bit field of OPTION_* */ u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ struct mptcp_out_options mptcp; }; static void mptcp_options_write(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct tcp_out_options *opts) { #if IS_ENABLED(CONFIG_MPTCP) if (unlikely(OPTION_MPTCP & opts->options)) mptcp_write_options(th, ptr, tp, &opts->mptcp); #endif } #ifdef CONFIG_CGROUP_BPF static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb, enum tcp_synack_type synack_type) { if (unlikely(!skb)) return BPF_WRITE_HDR_TCP_CURRENT_MSS; if (unlikely(synack_type == TCP_SYNACK_COOKIE)) return BPF_WRITE_HDR_TCP_SYNACK_COOKIE; return 0; } /* req, syn_skb and synack_type are used when writing synack */ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts, unsigned int *remaining) { struct bpf_sock_ops_kern sock_ops; int err; if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) || !*remaining) return; /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */ /* init sock_ops */ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB; if (req) { /* The listen "sk" cannot be passed here because * it is not locked. It would not make too much * sense to do bpf_setsockopt(listen_sk) based * on individual connection request also. * * Thus, "req" is passed here and the cgroup-bpf-progs * of the listen "sk" will be run. * * "req" is also used here for fastopen even the "sk" here is * a fullsock "child" sk. It is to keep the behavior * consistent between fastopen and non-fastopen on * the bpf programming side. */ sock_ops.sk = (struct sock *)req; sock_ops.syn_skb = syn_skb; } else { sock_owned_by_me(sk); sock_ops.is_fullsock = 1; sock_ops.sk = sk; } sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); sock_ops.remaining_opt_len = *remaining; /* tcp_current_mss() does not pass a skb */ if (skb) bpf_skops_init_skb(&sock_ops, skb, 0); err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); if (err || sock_ops.remaining_opt_len == *remaining) return; opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len; /* round up to 4 bytes */ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3; *remaining -= opts->bpf_opt_len; } static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts) { u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len; struct bpf_sock_ops_kern sock_ops; int err; if (likely(!max_opt_len)) return; memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB; if (req) { sock_ops.sk = (struct sock *)req; sock_ops.syn_skb = syn_skb; } else { sock_owned_by_me(sk); sock_ops.is_fullsock = 1; sock_ops.sk = sk; } sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); sock_ops.remaining_opt_len = max_opt_len; first_opt_off = tcp_hdrlen(skb) - max_opt_len; bpf_skops_init_skb(&sock_ops, skb, first_opt_off); err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); if (err) nr_written = 0; else nr_written = max_opt_len - sock_ops.remaining_opt_len; if (nr_written < max_opt_len) memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP, max_opt_len - nr_written); } #else static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts, unsigned int *remaining) { } static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts) { } #endif static __be32 *process_tcp_ao_options(struct tcp_sock *tp, const struct tcp_request_sock *tcprsk, struct tcp_out_options *opts, struct tcp_key *key, __be32 *ptr) { #ifdef CONFIG_TCP_AO u8 maclen = tcp_ao_maclen(key->ao_key); if (tcprsk) { u8 aolen = maclen + sizeof(struct tcp_ao_hdr); *ptr++ = htonl((TCPOPT_AO << 24) | (aolen << 16) | (tcprsk->ao_keyid << 8) | (tcprsk->ao_rcv_next)); } else { struct tcp_ao_key *rnext_key; struct tcp_ao_info *ao_info; ao_info = rcu_dereference_check(tp->ao_info, lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk)); rnext_key = READ_ONCE(ao_info->rnext_key); if (WARN_ON_ONCE(!rnext_key)) return ptr; *ptr++ = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key->ao_key) << 16) | (key->ao_key->sndid << 8) | (rnext_key->rcvid)); } opts->hash_location = (__u8 *)ptr; ptr += maclen / sizeof(*ptr); if (unlikely(maclen % sizeof(*ptr))) { memset(ptr, TCPOPT_NOP, sizeof(*ptr)); ptr++; } #endif return ptr; } /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of * TCP options, we learned this through the hard way, so be careful here. * Luckily we can at least blame others for their non-compliance but from * inter-operability perspective it seems that we're somewhat stuck with * the ordering which we have been using if we want to keep working with * those broken things (not that it currently hurts anybody as there isn't * particular reason why the ordering would need to be changed). * * At least SACK_PERM as the first option is known to lead to a disaster * (but it may well be that other scenarios fail similarly). */ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, const struct tcp_request_sock *tcprsk, struct tcp_out_options *opts, struct tcp_key *key) { __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ if (tcp_key_is_md5(key)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); /* overload cookie hash location */ opts->hash_location = (__u8 *)ptr; ptr += 4; } else if (tcp_key_is_ao(key)) { ptr = process_tcp_ao_options(tp, tcprsk, opts, key, ptr); } if (unlikely(opts->mss)) { *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | opts->mss); } if (likely(OPTION_TS & options)) { if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); options &= ~OPTION_SACK_ADVERTISE; } else { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); } *ptr++ = htonl(opts->tsval); *ptr++ = htonl(opts->tsecr); } if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); } if (unlikely(OPTION_WSCALE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->ws); } if (unlikely(opts->num_sack_blocks)) { struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; int this_sack; *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK))); for (this_sack = 0; this_sack < opts->num_sack_blocks; ++this_sack) { *ptr++ = htonl(sp[this_sack].start_seq); *ptr++ = htonl(sp[this_sack].end_seq); } tp->rx_opt.dsack = 0; } if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; u8 *p = (u8 *)ptr; u32 len; /* Fast Open option length */ if (foc->exp) { len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) | TCPOPT_FASTOPEN_MAGIC); p += TCPOLEN_EXP_FASTOPEN_BASE; } else { len = TCPOLEN_FASTOPEN_BASE + foc->len; *p++ = TCPOPT_FASTOPEN; *p++ = len; } memcpy(p, foc->val, foc->len); if ((len & 3) == 2) { p[foc->len] = TCPOPT_NOP; p[foc->len + 1] = TCPOPT_NOP; } ptr += (len + 3) >> 2; } smc_options_write(ptr, &options); mptcp_options_write(th, ptr, tp, opts); } static void smc_set_option(const struct tcp_sock *tp, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc) { if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { opts->options |= OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } } #endif } static void smc_set_option_cond(const struct tcp_sock *tp, const struct inet_request_sock *ireq, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc && ireq->smc_ok) { if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { opts->options |= OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } } #endif } static void mptcp_set_option_cond(const struct request_sock *req, struct tcp_out_options *opts, unsigned int *remaining) { if (rsk_is_mptcp(req)) { unsigned int size; if (mptcp_synack_options(req, &size, &opts->mptcp)) { if (*remaining >= size) { opts->options |= OPTION_MPTCP; *remaining -= size; } } } } /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_key *key) { struct tcp_sock *tp = tcp_sk(sk); unsigned int remaining = MAX_TCP_OPTION_SPACE; struct tcp_fastopen_request *fastopen = tp->fastopen_req; bool timestamps; /* Better than switch (key.type) as it has static branches */ if (tcp_key_is_md5(key)) { timestamps = false; opts->options |= OPTION_MD5; remaining -= TCPOLEN_MD5SIG_ALIGNED; } else { timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps); if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; remaining -= tcp_ao_len_aligned(key->ao_key); } } /* We always get an MSS option. The option bytes which will be seen in * normal data packets should timestamps be used, must be in the MSS * advertised. But we subtract them from tp->mss_cache so that * calculations in tcp_sendmsg are simpler etc. So account for this * fact here if necessary. If we don't do this correctly, as a * receiver we won't recognize data packets as being full sized when we * should, and thus we won't abide by the delayed ACK rules correctly. * SACKs don't matter, we never delay an ACK when we have any of those * going out. */ opts->mss = tcp_advertise_mss(sk); remaining -= TCPOLEN_MSS_ALIGNED; if (likely(timestamps)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) { opts->ws = tp->rx_opt.rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!(OPTION_TS & opts->options))) remaining -= TCPOLEN_SACKPERM_ALIGNED; } if (fastopen && fastopen->cookie.len >= 0) { u32 need = fastopen->cookie.len; need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE : TCPOLEN_FASTOPEN_BASE; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { opts->options |= OPTION_FAST_OPEN_COOKIE; opts->fastopen_cookie = &fastopen->cookie; remaining -= need; tp->syn_fastopen = 1; tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0; } } smc_set_option(tp, opts, &remaining); if (sk_is_mptcp(sk)) { unsigned int size; if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) { opts->options |= OPTION_MPTCP; remaining -= size; } } bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. */ static unsigned int tcp_synack_options(const struct sock *sk, struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_key *key, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; remaining -= TCPOLEN_MD5SIG_ALIGNED; /* We can't fit any SACK blocks in a packet with MD5 + TS * options. There was discussion about disabling SACK * rather than TS in order to fit in better with old, * buggy kernels, but that was deemed to be unnecessary. */ if (synack_type != TCP_SYNACK_COOKIE) ireq->tstamp_ok &= !ireq->sack_ok; } else if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; remaining -= tcp_ao_len_aligned(key->ao_key); ireq->tstamp_ok &= !ireq->sack_ok; } /* We always send an MSS option. */ opts->mss = mss; remaining -= TCPOLEN_MSS_ALIGNED; if (likely(ireq->wscale_ok)) { opts->ws = ireq->rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) + tcp_rsk(req)->ts_off; opts->tsecr = READ_ONCE(req->ts_recent); remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(ireq->sack_ok)) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!ireq->tstamp_ok)) remaining -= TCPOLEN_SACKPERM_ALIGNED; } if (foc != NULL && foc->len >= 0) { u32 need = foc->len; need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE : TCPOLEN_FASTOPEN_BASE; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { opts->options |= OPTION_FAST_OPEN_COOKIE; opts->fastopen_cookie = foc; remaining -= need; } } mptcp_set_option_cond(req, opts, &remaining); smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, synack_type, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; } /* Compute TCP options for ESTABLISHED sockets. This is not the * final wire format yet. */ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_key *key) { struct tcp_sock *tp = tcp_sk(sk); unsigned int size = 0; unsigned int eff_sacks; opts->options = 0; /* Better than switch (key.type) as it has static branches */ if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; size += TCPOLEN_MD5SIG_ALIGNED; } else if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; size += tcp_ao_len_aligned(key->ao_key); } if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; opts->tsval = skb ? tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } /* MPTCP options have precedence over SACK for the limited TCP * option space because a MPTCP connection would be forced to * fall back to regular TCP if a required multipath option is * missing. SACK still gets a chance to use whatever space is * left. */ if (sk_is_mptcp(sk)) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; unsigned int opt_size = 0; if (mptcp_established_options(sk, skb, &opt_size, remaining, &opts->mptcp)) { opts->options |= OPTION_MPTCP; size += opt_size; } } eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED + TCPOLEN_SACK_PERBLOCK)) return size; opts->num_sack_blocks = min_t(unsigned int, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); size += TCPOLEN_SACK_BASE_ALIGNED + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); size = MAX_TCP_OPTION_SPACE - remaining; } return size; } /* TCP SMALL QUEUES (TSQ) * * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) * to reduce RTT and bufferbloat. * We do this using a special skb destructor (tcp_wfree). * * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb * needs to be reallocated in a driver. * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc * * Since transmit from skb destructor is forbidden, we use a tasklet * to process all sockets that eventually need to send more skbs. * We use one tasklet per cpu, with its own queue of sockets. */ struct tsq_tasklet { struct tasklet_struct tasklet; struct list_head head; /* queue of tcp sockets */ }; static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); static void tcp_tsq_write(struct sock *sk) { if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) { struct tcp_sock *tp = tcp_sk(sk); if (tp->lost_out > tp->retrans_out && tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) { tcp_mstamp_refresh(tp); tcp_xmit_retransmit_queue(sk); } tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, 0, GFP_ATOMIC); } } static void tcp_tsq_handler(struct sock *sk) { bh_lock_sock(sk); if (!sock_owned_by_user(sk)) tcp_tsq_write(sk); else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); bh_unlock_sock(sk); } /* * One tasklet per cpu tries to send more skbs. * We run in tasklet context but need to disable irqs when * transferring tsq->head because tcp_wfree() might * interrupt us (non NAPI drivers) */ static void tcp_tasklet_func(struct tasklet_struct *t) { struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet); LIST_HEAD(list); unsigned long flags; struct list_head *q, *n; struct tcp_sock *tp; struct sock *sk; local_irq_save(flags); list_splice_init(&tsq->head, &list); local_irq_restore(flags); list_for_each_safe(q, n, &list) { tp = list_entry(q, struct tcp_sock, tsq_node); list_del(&tp->tsq_node); sk = (struct sock *)tp; smp_mb__before_atomic(); clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); tcp_tsq_handler(sk); sk_free(sk); } } #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ TCPF_WRITE_TIMER_DEFERRED | \ TCPF_DELACK_TIMER_DEFERRED | \ TCPF_MTU_REDUCED_DEFERRED | \ TCPF_ACK_DEFERRED) /** * tcp_release_cb - tcp release_sock() callback * @sk: socket * * called from release_sock() to perform protocol dependent * actions before socket release. */ void tcp_release_cb(struct sock *sk) { unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags); unsigned long nflags; /* perform an atomic operation only if at least one flag is set */ do { if (!(flags & TCP_DEFERRED_ALL)) return; nflags = flags & ~TCP_DEFERRED_ALL; } while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags)); if (flags & TCPF_TSQ_DEFERRED) { tcp_tsq_write(sk); __sock_put(sk); } if (flags & TCPF_WRITE_TIMER_DEFERRED) { tcp_write_timer_handler(sk); __sock_put(sk); } if (flags & TCPF_DELACK_TIMER_DEFERRED) { tcp_delack_timer_handler(sk); __sock_put(sk); } if (flags & TCPF_MTU_REDUCED_DEFERRED) { inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); __sock_put(sk); } if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk)) tcp_send_ack(sk); } EXPORT_SYMBOL(tcp_release_cb); void __init tcp_tasklet_init(void) { int i; for_each_possible_cpu(i) { struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); INIT_LIST_HEAD(&tsq->head); tasklet_setup(&tsq->tasklet, tcp_tasklet_func); } } /* * Write buffer destructor automatically called from kfree_skb. * We can't xmit new skbs from this context, as we might already * hold qdisc lock. */ void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); unsigned long flags, nval, oval; struct tsq_tasklet *tsq; bool empty; /* Keep one reference on sk_wmem_alloc. * Will be released by sk_free() from here or tcp_tasklet_func() */ WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc)); /* If this softirq is serviced by ksoftirqd, we are likely under stress. * Wait until our queues (qdisc + devices) are drained. * This gives : * - less callbacks to tcp_write_xmit(), reducing stress (batches) * - chance for incoming ACK (processed by another cpu maybe) * to migrate this flow (skb->ooo_okay will be eventually set) */ if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) goto out; oval = smp_load_acquire(&sk->sk_tsq_flags); do { if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) goto out; nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED; } while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval)); /* queue this socket to tasklet queue */ local_irq_save(flags); tsq = this_cpu_ptr(&tsq_tasklet); empty = list_empty(&tsq->head); list_add(&tp->tsq_node, &tsq->head); if (empty) tasklet_schedule(&tsq->tasklet); local_irq_restore(flags); return; out: sk_free(sk); } /* Note: Called under soft irq. * We can call TCP stack right away, unless socket is owned by user. */ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) { struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer); struct sock *sk = (struct sock *)tp; tcp_tsq_handler(sk); sock_put(sk); return HRTIMER_NORESTART; } static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, u64 prior_wstamp) { struct tcp_sock *tp = tcp_sk(sk); if (sk->sk_pacing_status != SK_PACING_NONE) { unsigned long rate = READ_ONCE(sk->sk_pacing_rate); /* Original sch_fq does not pace first 10 MSS * Note that tp->data_segs_out overflows after 2^32 packets, * this is a minor annoyance. */ if (rate != ~0UL && rate && tp->data_segs_out >= 10) { u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate); u64 credit = tp->tcp_wstamp_ns - prior_wstamp; /* take into account OS jitter */ len_ns -= min_t(u64, len_ns / 2, credit); tp->tcp_wstamp_ns += len_ns; } } list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. * All SKB's seen here are completely headerless. It is our * job to build the TCP header, and pass the packet down to * IP so it can do the same plus pass the packet off to the * device. * * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask, u32 rcv_nxt) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; struct tcp_sock *tp; struct tcp_skb_cb *tcb; struct tcp_out_options opts; unsigned int tcp_options_size, tcp_header_size; struct sk_buff *oskb = NULL; struct tcp_key key; struct tcphdr *th; u64 prior_wstamp; int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); if (clone_it) { oskb = skb; tcp_skb_tsorted_save(oskb) { if (unlikely(skb_cloned(oskb))) skb = pskb_copy(oskb, gfp_mask); else skb = skb_clone(oskb, gfp_mask); } tcp_skb_tsorted_restore(oskb); if (unlikely(!skb)) return -ENOBUFS; /* retransmit skbs might have a non zero value in skb->dev * because skb->dev is aliased with skb->rbnode.rb_left */ skb->dev = NULL; } inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); tcp_get_current_key(sk, &key); if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { tcp_options_size = tcp_syn_options(sk, skb, &opts, &key); } else { tcp_options_size = tcp_established_options(sk, skb, &opts, &key); /* Force a PSH flag on all (GSO) packets to expedite GRO flush * at receiver : This slightly improve GRO performance. * Note that we do not force the PSH flag for non GSO packets, * because they might be sent under high congestion events, * and in this case it is better to delay the delivery of 1-MSS * packets and thus the corresponding ACK packet that would * release the following packet. */ if (tcp_skb_pcount(skb) > 1) tcb->tcp_flags |= TCPHDR_PSH; } tcp_header_size = tcp_options_size + sizeof(struct tcphdr); /* We set skb->ooo_okay to one if this packet can select * a different TX queue than prior packets of this flow, * to avoid self inflicted reorders. * The 'other' queue decision is based on current cpu number * if XPS is enabled, or sk->sk_txhash otherwise. * We can switch to another (and better) queue if: * 1) No packet with payload is in qdisc/device queues. * Delays in TX completion can defeat the test * even if packets were already sent. * 2) Or rtx queue is empty. * This mitigates above case if ACK packets for * all prior packets were already processed. */ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) || tcp_rtx_queue_empty(sk); /* If we had to use memory reserve to allocate this skb, * this might cause drops if packet is looped back : * Other socket might not have SOCK_MEMALLOC. * Packets not looped back do not care about pfmemalloc. */ skb->pfmemalloc = 0; skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_orphan(skb); skb->sk = sk; skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm)); /* Build TCP header and checksum it. */ th = (struct tcphdr *)skb->data; th->source = inet->inet_sport; th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); th->ack_seq = htonl(rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->tcp_flags); th->check = 0; th->urg_ptr = 0; /* The urg_mode check is necessary during a below snd_una win probe */ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { if (before(tp->snd_up, tcb->seq + 0x10000)) { th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { th->urg_ptr = htons(0xFFFF); th->urg = 1; } } skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) { th->window = htons(tcp_select_window(sk)); tcp_ecn_send(sk, skb, th, tcp_header_size); } else { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. */ th->window = htons(min(tp->rcv_wnd, 65535U)); } tcp_options_write(th, tp, NULL, &opts, &key); if (tcp_key_is_md5(&key)) { #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ sk_gso_disable(sk); tp->af_specific->calc_md5_hash(opts.hash_location, key.md5_key, sk, skb); #endif } else if (tcp_key_is_ao(&key)) { int err; err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th, opts.hash_location); if (err) { kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); return -ENOMEM; } } /* BPF prog is the last one writing header option */ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, tcp_v6_send_check, tcp_v4_send_check, sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); tp->bytes_sent += skb->len - tcp_header_size; } if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); tp->segs_out += tcp_skb_pcount(skb); skb_set_hash_from_sk(skb, sk); /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */ /* Cleanup our debris for IP stacks */ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); tcp_add_tx_delay(skb, tp); err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit, inet6_csk_xmit, ip_queue_xmit, sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { tcp_enter_cwr(sk); err = net_xmit_eval(err); } if (!err && oskb) { tcp_update_skb_after_send(sk, oskb, prior_wstamp); tcp_rate_skb_sent(sk, oskb); } return err; } static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, tcp_sk(sk)->rcv_nxt); } /* This routine just queues the buffer for sending. * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, * otherwise socket can stall. */ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Advance write_seq and place onto the write_queue. */ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); } /* Initialize TSO segments for a packet. */ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { if (skb->len <= mss_now) { /* Avoid the costly divide in the normal * non-TSO case. */ tcp_skb_pcount_set(skb, 1); TCP_SKB_CB(skb)->tcp_gso_size = 0; } else { tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); TCP_SKB_CB(skb)->tcp_gso_size = mss_now; } } /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) { struct tcp_sock *tp = tcp_sk(sk); tp->packets_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) tp->lost_out -= decr; /* Reno case is special. Sigh... */ if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); if (tp->lost_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) tp->lost_cnt_hint -= decr; tcp_verify_left_out(tp); } static bool tcp_has_tx_tstamp(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->txstamp_ack || (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP); } static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) { struct skb_shared_info *shinfo = skb_shinfo(skb); if (unlikely(tcp_has_tx_tstamp(skb)) && !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) { struct skb_shared_info *shinfo2 = skb_shinfo(skb2); u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP; shinfo->tx_flags &= ~tsflags; shinfo2->tx_flags |= tsflags; swap(shinfo->tskey, shinfo2->tskey); TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack; TCP_SKB_CB(skb)->txstamp_ack = 0; } } static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) { TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor; TCP_SKB_CB(skb)->eor = 0; } /* Insert buff after skb on the write or rtx queue of sk. */ static void tcp_insert_write_queue_after(struct sk_buff *skb, struct sk_buff *buff, struct sock *sk, enum tcp_queue tcp_queue) { if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) __skb_queue_after(&sk->sk_write_queue, skb, buff); else tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); } /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int old_factor; long limit; int nlen; u8 flags; if (WARN_ON(len > skb->len)) return -EINVAL; DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb)); /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb. * We need some allowance to not penalize applications setting small * SO_SNDBUF values. * Also allow first and last skb in retransmit queue to be split. */ limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE); if (unlikely((sk->sk_wmem_queued >> 1) > limit && tcp_queue != TCP_FRAG_IN_WRITE_QUEUE && skb != tcp_rtx_queue_head(sk) && skb != tcp_rtx_queue_tail(sk))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG); return -ENOMEM; } if (skb_unclone_keeptruesize(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ buff = tcp_stream_alloc_skb(sk, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); nlen = skb->len - len; buff->truesize += nlen; skb->truesize -= nlen; /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->tcp_flags = flags; TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; tcp_skb_fragment_eor(skb, buff); skb_split(skb, buff, len); skb_set_delivery_time(buff, skb->tstamp, true); tcp_fragment_tstamp(skb, buff); old_factor = tcp_skb_pcount(skb); /* Fix up tso_factor for both original and new SKB. */ tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); /* Update delivered info for the new segment */ TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx; /* If this packet has been sent out already, we must * adjust the various packet counters. */ if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) { int diff = old_factor - tcp_skb_pcount(skb) - tcp_skb_pcount(buff); if (diff) tcp_adjust_pcount(sk, skb, diff); } /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } /* This is similar to __pskb_pull_tail(). The difference is that pulled * data is not copied, but immediately discarded. */ static int __pskb_trim_head(struct sk_buff *skb, int len) { struct skb_shared_info *shinfo; int i, k, eat; DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb)); eat = len; k = 0; shinfo = skb_shinfo(skb); for (i = 0; i < shinfo->nr_frags; i++) { int size = skb_frag_size(&shinfo->frags[i]); if (size <= eat) { skb_frag_unref(skb, i); eat -= size; } else { shinfo->frags[k] = shinfo->frags[i]; if (eat) { skb_frag_off_add(&shinfo->frags[k], eat); skb_frag_size_sub(&shinfo->frags[k], eat); eat = 0; } k++; } } shinfo->nr_frags = k; skb->data_len -= len; skb->len = skb->data_len; return len; } /* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { u32 delta_truesize; if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; delta_truesize = __pskb_trim_head(skb, len); TCP_SKB_CB(skb)->seq += len; skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); if (!skb_zcopy_pure(skb)) sk_mem_uncharge(sk, delta_truesize); /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); return 0; } /* Calculate MSS not accounting any TCP options. */ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); /* Clamp it (mss_clamp does not include tcp options) */ if (mss_now > tp->rx_opt.mss_clamp) mss_now = tp->rx_opt.mss_clamp; /* Now subtract optional transport overhead */ mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ mss_now = max(mss_now, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss)); return mss_now; } /* Calculate MSS. Not accounting for SACKs here. */ int tcp_mtu_to_mss(struct sock *sk, int pmtu) { /* Subtract TCP options size, not including SACKs */ return __tcp_mtu_to_mss(sk, pmtu) - (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); } EXPORT_SYMBOL(tcp_mtu_to_mss); /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); return mss + tp->tcp_header_len + icsk->icsk_ext_hdr_len + icsk->icsk_af_ops->net_header_len; } EXPORT_SYMBOL(tcp_mss_to_mtu); /* MTU probing init per socket */ void tcp_mtup_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss)); icsk->icsk_mtup.probe_size = 0; if (icsk->icsk_mtup.enabled) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } EXPORT_SYMBOL(tcp_mtup_init); /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts for TCP options, but includes only bare TCP header. tp->rx_opt.mss_clamp is mss negotiated at connection setup. It is minimum of user_mss and mss received with SYN. It also does not include TCP options. inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function. tp->mss_cache is current effective sending mss, including all tcp options except for SACKs. It is evaluated, taking into account current pmtu, but never exceeds tp->rx_opt.mss_clamp. NOTE1. rfc1122 clearly states that advertised MSS DOES NOT include either tcp or ip options. NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache are READ ONLY outside this function. --ANK (980731) */ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; if (icsk->icsk_mtup.search_high > pmtu) icsk->icsk_mtup.search_high = pmtu; mss_now = tcp_mtu_to_mss(sk, pmtu); mss_now = tcp_bound_to_half_wnd(tp, mss_now); /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; if (icsk->icsk_mtup.enabled) mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); tp->mss_cache = mss_now; return mss_now; } EXPORT_SYMBOL(tcp_sync_mss); /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. */ unsigned int tcp_current_mss(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct dst_entry *dst = __sk_dst_get(sk); u32 mss_now; unsigned int header_len; struct tcp_out_options opts; struct tcp_key key; mss_now = tp->mss_cache; if (dst) { u32 mtu = dst_mtu(dst); if (mtu != inet_csk(sk)->icsk_pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } tcp_get_current_key(sk, &key); header_len = tcp_established_options(sk, NULL, &opts, &key) + sizeof(struct tcphdr); /* The mss_cache is sized based on tp->tcp_header_len, which assumes * some common options. If this is an odd packet (because we have SACK * blocks etc) then our calculated header_len will be different, and * we have to adjust mss_now correspondingly */ if (header_len != tp->tcp_header_len) { int delta = (int) header_len - tp->tcp_header_len; mss_now -= delta; } return mss_now; } /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. * As additional protections, we do not touch cwnd in retransmission phases, * and if application hit its sndbuf limit recently. */ static void tcp_cwnd_application_limited(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { /* Limited by application or receiver window. */ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 win_used = max(tp->snd_cwnd_used, init_win); if (win_used < tcp_snd_cwnd(tp)) { tp->snd_ssthresh = tcp_current_ssthresh(sk); tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1); } tp->snd_cwnd_used = 0; } tp->snd_cwnd_stamp = tcp_jiffies32; } static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); /* Track the strongest available signal of the degree to which the cwnd * is fully utilized. If cwnd-limited then remember that fact for the * current window. If not cwnd-limited then track the maximum number of * outstanding packets in the current window. (If cwnd-limited then we * chose to not update tp->max_packets_out to avoid an extra else * clause with no functional impact.) */ if (!before(tp->snd_una, tp->cwnd_usage_seq) || is_cwnd_limited || (!tp->is_cwnd_limited && tp->packets_out > tp->max_packets_out)) { tp->is_cwnd_limited = is_cwnd_limited; tp->max_packets_out = tp->packets_out; tp->cwnd_usage_seq = tp->snd_nxt; } if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_jiffies32; } else { /* Network starves. */ if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) && (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); /* The following conditions together indicate the starvation * is caused by insufficient sender buffer: * 1) just sent some data (see tcp_write_xmit) * 2) not cwnd limited (this else condition) * 3) no more data to send (tcp_write_queue_empty()) * 4) application is hitting buffer limit (SOCK_NOSPACE) */ if (tcp_write_queue_empty(sk) && sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); } } /* Minshall's variant of the Nagle send check. */ static bool tcp_minshall_check(const struct tcp_sock *tp) { return after(tp->snd_sml, tp->snd_una) && !after(tp->snd_sml, tp->snd_nxt); } /* Update snd_sml if this skb is under mss * Note that a TSO packet might end with a sub-mss segment * The test is really : * if ((skb->len % mss) != 0) * tp->snd_sml = TCP_SKB_CB(skb)->end_seq; * But we can avoid doing the divide again given we already have * skb_pcount = skb->len / mss_now */ static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, const struct sk_buff *skb) { if (skb->len < tcp_skb_pcount(skb) * mss_now) tp->snd_sml = TCP_SKB_CB(skb)->end_seq; } /* Return false, if packet can be sent now without violation Nagle's rules: * 1. It is full sized. (provided by caller in %partial bool) * 2. Or it contains FIN. (already checked by caller) * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. */ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, int nonagle) { return partial && ((nonagle & TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } /* Return how many segs we'd like on a TSO packet, * depending on current pacing rate, and how close the peer is. * * Rationale is: * - For close peers, we rather send bigger packets to reduce * cpu costs, because occasional losses will be repaired fast. * - For long distance/rtt flows, we would like to get ACK clocking * with 1 ACK per ms. * * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting * in bigger TSO bursts. We we cut the RTT-based allowance in half * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance * is below 1500 bytes after 6 * ~500 usec = 3ms. */ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, int min_tso_segs) { unsigned long bytes; u32 r; bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log); if (r < BITS_PER_TYPE(sk->sk_gso_max_size)) bytes += sk->sk_gso_max_size >> r; bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size); return max_t(u32, bytes / mss_now, min_tso_segs); } /* Return the number of segments we want in the skb we are transmitting. * See if congestion control module wants to decide; otherwise, autosize. */ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; u32 min_tso, tso_segs; min_tso = ca_ops->min_tso_segs ? ca_ops->min_tso_segs(sk) : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, unsigned int mss_now, unsigned int max_segs, int nonagle) { const struct tcp_sock *tp = tcp_sk(sk); u32 partial, needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; max_len = mss_now * max_segs; if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) return max_len; needed = min(skb->len, window); if (max_len <= needed) return max_len; partial = needed % mss_now; /* If last segment is not a full MSS, check if Nagle rules allow us * to include this last segment in this skb. * Otherwise, we'll split the skb at last MSS boundary */ if (tcp_nagle_check(partial != 0, tp, nonagle)) return needed - partial; return needed; } /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb) { u32 in_flight, cwnd, halfcwnd; /* Don't be strict about the congestion window for the final FIN. */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1) return 1; in_flight = tcp_packets_in_flight(tp); cwnd = tcp_snd_cwnd(tp); if (in_flight >= cwnd) return 0; /* For better scheduling, ensure we have at least * 2 GSO packets in flight. */ halfcwnd = max(cwnd >> 1, 1U); return min(halfcwnd, cwnd - in_flight); } /* Initialize TSO state of a skb. * This must be invoked the first time we consider transmitting * SKB onto the wire. */ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { tcp_set_skb_tso_segs(skb, mss_now); tso_segs = tcp_skb_pcount(skb); } return tso_segs; } /* Return true if the Nagle test allows this packet to be * sent now. */ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned int cur_mss, int nonagle) { /* Nagle rule does not apply to frames, which sit in the middle of the * write_queue (they have no chances to get new data). * * This is implemented in the callers, where they modify the 'nonagle' * argument based upon the location of SKB in the send queue. */ if (nonagle & TCP_NAGLE_PUSH) return true; /* Don't use the nagle rule for urgent data (or for the final FIN). */ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return true; if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) return true; return false; } /* Does at least the first segment of SKB fit into the send window? */ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned int cur_mss) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (skb->len > cur_mss) end_seq = TCP_SKB_CB(skb)->seq + cur_mss; return !after(end_seq, tcp_wnd_end(tp)); } /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet * which is put after SKB on the list. It is very much like * tcp_fragment() except that it may make several kinds of assumptions * in order to speed up the splitting operation. In particular, we * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now, gfp_t gfp) { int nlen = skb->len - len; struct sk_buff *buff; u8 flags; /* All of a TSO frame must be composed of paged data. */ DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len); buff = tcp_stream_alloc_skb(sk, gfp, true); if (unlikely(!buff)) return -ENOMEM; skb_copy_decrypted(buff, skb); mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->tcp_flags = flags; tcp_skb_fragment_eor(skb, buff); skb_split(skb, buff, len); tcp_fragment_tstamp(skb, buff); /* Fix up tso_factor for both original and new SKB. */ tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE); return 0; } /* Try to defer sending, if possible, in order to minimize the amount * of TSO splitting we do. View it as a kind of TSO Nagle test. * * This algorithm is from John Heffner. */ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, bool *is_cwnd_limited, bool *is_rwnd_limited, u32 max_segs) { const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *head; int win_divisor; s64 delta; if (icsk->icsk_ca_state >= TCP_CA_Recovery) goto send_now; /* Avoid bursty behavior by allowing defer * only if the last write was recent (1 ms). * Note that tp->tcp_wstamp_ns can be in the future if we have * packets waiting in a qdisc or device for EDT delivery. */ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC; if (delta > 0) goto send_now; in_flight = tcp_packets_in_flight(tp); BUG_ON(tcp_skb_pcount(skb) <= 1); BUG_ON(tcp_snd_cwnd(tp) <= in_flight); send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* From in_flight test above, we know that cwnd > in_flight. */ cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache; limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ if (limit >= max_segs * tp->mss_cache) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache); /* If at least some fraction of a window is available, * just use it. */ chunk /= win_divisor; if (limit >= chunk) goto send_now; } else { /* Different approach, try not to defer past a single * ACK. Receiver should ACK every other full sized * frame, so if we have space for more than 3 frames * then send now. */ if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache) goto send_now; } /* TODO : use tsorted_sent_queue ? */ head = tcp_rtx_queue_head(sk); if (!head) goto send_now; delta = tp->tcp_clock_cache - head->tstamp; /* If next ACK is likely to come too late (half srtt), do not defer */ if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0) goto send_now; /* Ok, it looks like it is advisable to defer. * Three cases are tracked : * 1) We are cwnd-limited * 2) We are rwnd-limited * 3) We are application limited. */ if (cong_win < send_win) { if (cong_win <= skb->len) { *is_cwnd_limited = true; return true; } } else { if (send_win <= skb->len) { *is_rwnd_limited = true; return true; } } /* If this packet won't get more data, do not wait. */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || TCP_SKB_CB(skb)->eor) goto send_now; return true; send_now: return false; } static inline void tcp_mtu_check_reprobe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); u32 interval; s32 delta; interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval); delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp; if (unlikely(delta >= interval * HZ)) { int mss = tcp_current_mss(sk); /* Update current search range */ icsk->icsk_mtup.probe_size = 0; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); /* Update probe time stamp */ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } } static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) { struct sk_buff *skb, *next; skb = tcp_send_head(sk); tcp_for_write_queue_from_safe(skb, next, sk) { if (len <= skb->len) break; if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb) || !skb_pure_zcopy_same(skb, next)) return false; len -= skb->len; } return true; } static int tcp_clone_payload(struct sock *sk, struct sk_buff *to, int probe_size) { skb_frag_t *lastfrag = NULL, *fragto = skb_shinfo(to)->frags; int i, todo, len = 0, nr_frags = 0; const struct sk_buff *skb; if (!sk_wmem_schedule(sk, to->truesize + probe_size)) return -ENOMEM; skb_queue_walk(&sk->sk_write_queue, skb) { const skb_frag_t *fragfrom = skb_shinfo(skb)->frags; if (skb_headlen(skb)) return -EINVAL; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, fragfrom++) { if (len >= probe_size) goto commit; todo = min_t(int, skb_frag_size(fragfrom), probe_size - len); len += todo; if (lastfrag && skb_frag_page(fragfrom) == skb_frag_page(lastfrag) && skb_frag_off(fragfrom) == skb_frag_off(lastfrag) + skb_frag_size(lastfrag)) { skb_frag_size_add(lastfrag, todo); continue; } if (unlikely(nr_frags == MAX_SKB_FRAGS)) return -E2BIG; skb_frag_page_copy(fragto, fragfrom); skb_frag_off_copy(fragto, fragfrom); skb_frag_size_set(fragto, todo); nr_frags++; lastfrag = fragto++; } } commit: WARN_ON_ONCE(len != probe_size); for (i = 0; i < nr_frags; i++) skb_frag_ref(to, i); skb_shinfo(to)->nr_frags = nr_frags; to->truesize += probe_size; to->len += probe_size; to->data_len += probe_size; __skb_header_release(to); return 0; } /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing * changes resulting in larger path MTUs. * * Returns 0 if we should wait to probe (no cwnd available), * 1 if a probe was sent, * -1 otherwise */ static int tcp_mtu_probe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *nskb, *next; struct net *net = sock_net(sk); int probe_size; int size_needed; int copy, len; int mss_now; int interval; /* Not currently probing/verifying, * not in recovery, * have enough cwnd, and * not SACKing (the variable headers throw things off) */ if (likely(!icsk->icsk_mtup.enabled || icsk->icsk_mtup.probe_size || inet_csk(sk)->icsk_ca_state != TCP_CA_Open || tcp_snd_cwnd(tp) < 11 || tp->rx_opt.num_sacks || tp->rx_opt.dsack)) return -1; /* Use binary search for probe_size between tcp_mss_base, * and current mss_clamp. if (search_high - search_low) * smaller than a threshold, backoff from probing. */ mss_now = tcp_current_mss(sk); probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high + icsk->icsk_mtup.search_low) >> 1); size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low; /* When misfortune happens, we are reprobing actively, * and then reprobe timer has expired. We stick with current * probing process by not resetting search range to its orignal. */ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) { /* Check whether enough time has elaplased for * another round of probing. */ tcp_mtu_check_reprobe(sk); return -1; } /* Have enough data in the send queue to probe? */ if (tp->write_seq - tp->snd_nxt < size_needed) return -1; if (tp->snd_wnd < size_needed) return -1; if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp))) return 0; /* Do we need to wait to drain cwnd? With none in flight, don't stall */ if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) { if (!tcp_packets_in_flight(tp)) return -1; else return 0; } if (!tcp_can_coalesce_send_queue_head(sk, probe_size)) return -1; /* We're allowed to probe. Build it now. */ nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, false); if (!nskb) return -1; /* build the payload, and be prepared to abort if this fails. */ if (tcp_clone_payload(sk, nskb, probe_size)) { tcp_skb_tsorted_anchor_cleanup(nskb); consume_skb(nskb); return -1; } sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); skb_copy_decrypted(nskb, skb); mptcp_skb_ext_copy(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; tcp_insert_write_queue_before(nskb, skb, sk); tcp_highest_sack_replace(sk, skb, nskb); len = 0; tcp_for_write_queue_from_safe(skb, next, sk) { copy = min_t(int, skb->len, probe_size - len); if (skb->len <= copy) { /* We've eaten all the data from this skb. * Throw it away. */ TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; /* If this is the last SKB we copy and eor is set * we need to propagate it to the new skb. */ TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor; tcp_skb_collapse_tstamp(nskb, skb); tcp_unlink_write_queue(skb, sk); tcp_wmem_free_skb(sk, skb); } else { TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & ~(TCPHDR_FIN|TCPHDR_PSH); __pskb_trim_head(skb, copy); tcp_set_skb_tso_segs(skb, mss_now); TCP_SKB_CB(skb)->seq += copy; } len += copy; if (len >= probe_size) break; } tcp_init_tso_segs(nskb, nskb->len); /* We're ready to send. If this fails, the probe will * be resegmented into mss-sized pieces by tcp_write_xmit(). */ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending * effectively two packets. */ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1); tcp_event_new_data_sent(sk, nskb); icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq; tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq; return 1; } return -1; } static bool tcp_pacing_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_needs_internal_pacing(sk)) return false; if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache) return false; if (!hrtimer_is_queued(&tp->pacing_timer)) { hrtimer_start(&tp->pacing_timer, ns_to_ktime(tp->tcp_wstamp_ns), HRTIMER_MODE_ABS_PINNED_SOFT); sock_hold(sk); } return true; } static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk) { const struct rb_node *node = sk->tcp_rtx_queue.rb_node; /* No skb in the rtx queue. */ if (!node) return true; /* Only one skb in rtx queue. */ return !node->rb_left && !node->rb_right; } /* TCP Small Queues : * Control number of packets in qdisc/devices to two packets / or ~1 ms. * (These limits are doubled for retransmits) * This allows for : * - better RTT estimation and ACK scheduling * - faster recovery * - high rates * Alas, some drivers / subsystems require a fair amount * of queued bytes to ensure line rate. * One example is wifi aggregation (802.11 AMPDU) */ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, unsigned int factor) { unsigned long limit; limit = max_t(unsigned long, 2 * skb->truesize, READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift)); if (sk->sk_pacing_status == SK_PACING_NONE) limit = min_t(unsigned long, limit, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); limit <<= factor; if (static_branch_unlikely(&tcp_tx_delay_enabled) && tcp_sk(sk)->tcp_tx_delay) { u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) * tcp_sk(sk)->tcp_tx_delay; /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we * approximate our needs assuming an ~100% skb->truesize overhead. * USEC_PER_SEC is approximated by 2^20. * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift. */ extra_bytes >>= (20 - 1); limit += extra_bytes; } if (refcount_read(&sk->sk_wmem_alloc) > limit) { /* Always send skb if rtx queue is empty or has one skb. * No need to wait for TX completion to call us back, * after softirq/tasklet schedule. * This helps when TX completions are delayed too much. */ if (tcp_rtx_queue_empty_or_single_skb(sk)) return false; set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); /* It is possible TX completion already happened * before we set TSQ_THROTTLED, so we must * test again the condition. */ smp_mb__after_atomic(); if (refcount_read(&sk->sk_wmem_alloc) > limit) return true; } return false; } static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) { const u32 now = tcp_jiffies32; enum tcp_chrono old = tp->chrono_type; if (old > TCP_CHRONO_UNSPEC) tp->chrono_stat[old - 1] += now - tp->chrono_start; tp->chrono_start = now; tp->chrono_type = new; } void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type) { struct tcp_sock *tp = tcp_sk(sk); /* If there are multiple conditions worthy of tracking in a * chronograph then the highest priority enum takes precedence * over the other conditions. So that if something "more interesting" * starts happening, stop the previous chrono and start a new one. */ if (type > tp->chrono_type) tcp_chrono_set(tp, type); } void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) { struct tcp_sock *tp = tcp_sk(sk); /* There are multiple conditions worthy of tracking in a * chronograph, so that the highest priority enum takes * precedence over the other conditions (see tcp_chrono_start). * If a condition stops, we only stop chrono tracking if * it's the "most interesting" or current chrono we are * tracking and starts busy chrono if we have pending data. */ if (tcp_rtx_and_write_queues_empty(sk)) tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); else if (type == tp->chrono_type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); } /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. * * LARGESEND note: !tcp_urg_mode is overkill, only frames between * snd_up-64k-mss .. snd_up cannot be large. However, taking into * account rare use of URG, this is not a big flaw. * * Send at most one packet when push_one > 0. Temporarily ignore * cwnd limit to force at most one packet out when push_one == 2. * Returns true, if no segments are in flight and we have queued segments, * but cannot send anything now because of SWS or another problem. */ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int tso_segs, sent_pkts; int cwnd_quota; int result; bool is_cwnd_limited = false, is_rwnd_limited = false; u32 max_segs; sent_pkts = 0; tcp_mstamp_refresh(tp); if (!push_one) { /* Do MTU probing. */ result = tcp_mtu_probe(sk); if (!result) { return false; } else if (result > 0) { sent_pkts = 1; } } max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ tp->tcp_wstamp_ns = tp->tcp_clock_cache; skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ } if (tcp_pacing_check(sk)) break; tso_segs = tcp_init_tso_segs(skb, mss_now); BUG_ON(!tso_segs); cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; else break; } if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { is_rwnd_limited = true; break; } if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)))) break; } else { if (!push_one && tcp_tso_should_defer(sk, skb, &is_cwnd_limited, &is_rwnd_limited, max_segs)) break; } limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, min_t(unsigned int, cwnd_quota, max_segs), nonagle); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; if (tcp_small_queue_check(sk, skb, 0)) break; /* Argh, we hit an empty skb(), presumably a thread * is sleeping in sendmsg()/sk_stream_wait_memory(). * We do not want to send a pure-ack packet and have * a strange looking rtx queue with empty packet(s). */ if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) break; if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; repair: /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ tcp_event_new_data_sent(sk, skb); tcp_minshall_update(tp, mss_now, skb); sent_pkts += tcp_skb_pcount(skb); if (push_one) break; } if (is_rwnd_limited) tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED); else tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp)); if (likely(sent_pkts || is_cwnd_limited)) tcp_cwnd_validate(sk, is_cwnd_limited); if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk, false); return false; } return !tp->packets_out && !tcp_write_queue_empty(sk); } bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, timeout_us, rto_delta_us; int early_retrans; /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ if (rcu_access_pointer(tp->fastopen_rsk)) return false; early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans); /* Schedule a loss probe in 2*RTT for SACK capable connections * not in loss recovery, that are either limited by cwnd or application. */ if ((early_retrans != 3 && early_retrans != 4) || !tp->packets_out || !tcp_is_sack(tp) || (icsk->icsk_ca_state != TCP_CA_Open && icsk->icsk_ca_state != TCP_CA_CWR)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account * for delayed ack when there's one outstanding packet. If no RTT * sample is available then probe after TCP_TIMEOUT_INIT. */ if (tp->srtt_us) { timeout_us = tp->srtt_us >> 2; if (tp->packets_out == 1) timeout_us += tcp_rto_min_us(sk); else timeout_us += TCP_TIMEOUT_MIN_US; timeout = usecs_to_jiffies(timeout_us); } else { timeout = TCP_TIMEOUT_INIT; } /* If the RTO formula yields an earlier time, then use that time. */ rto_delta_us = advancing_rto ? jiffies_to_usecs(inet_csk(sk)->icsk_rto) : tcp_rto_delta_us(sk); /* How far in future is RTO? */ if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); return true; } /* Thanks to skb fast clones, we can detect if a prior transmit of * a packet is still in a qdisc or driver queue. * In this case, there is very little point doing a retransmit ! */ static bool skb_still_in_host_queue(struct sock *sk, const struct sk_buff *skb) { if (unlikely(skb_fclone_busy(sk, skb))) { set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); smp_mb__after_atomic(); if (skb_fclone_busy(sk, skb)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; } } return false; } /* When probe timeout (PTO) fires, try send a new segment if possible, else * retransmit the last segment. */ void tcp_send_loss_probe(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int pcount; int mss = tcp_current_mss(sk); /* At most one outstanding TLP */ if (tp->tlp_high_seq) goto rearm_timer; tp->tlp_retrans = 0; skb = tcp_send_head(sk); if (skb && tcp_snd_wnd_test(tp, skb, mss)) { pcount = tp->packets_out; tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); if (tp->packets_out > pcount) goto probe_sent; goto rearm_timer; } skb = skb_rb_last(&sk->tcp_rtx_queue); if (unlikely(!skb)) { WARN_ONCE(tp->packets_out, "invalid inflight: %u state %u cwnd %u mss %d\n", tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss); inet_csk(sk)->icsk_pending = 0; return; } if (skb_still_in_host_queue(sk, skb)) goto rearm_timer; pcount = tcp_skb_pcount(skb); if (WARN_ON(!pcount)) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; skb = skb_rb_next(skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; if (__tcp_retransmit_skb(sk, skb, 1)) goto rearm_timer; tp->tlp_retrans = 1; probe_sent: /* Record snd_nxt for loss detection. */ tp->tlp_high_seq = tp->snd_nxt; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); /* Reset s.t. tcp_rearm_rto will restart timer from now */ inet_csk(sk)->icsk_pending = 0; rearm_timer: tcp_rearm_rto(sk); } /* Push out any pending frames which were held back due to * TCP_CORK or attempt at coalescing tiny packets. * The socket must be locked by the caller. */ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle) { /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and * all will be happy. */ if (unlikely(sk->sk_state == TCP_CLOSE)) return; if (tcp_write_xmit(sk, cur_mss, nonagle, 0, sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. */ void tcp_push_one(struct sock *sk, unsigned int mss_now) { struct sk_buff *skb = tcp_send_head(sk); BUG_ON(!skb || skb->len < mss_now); tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation); } /* This function returns the amount that we can raise the * usable window based on the following constraints * * 1. The window can never be shrunk once it is offered (RFC 793) * 2. We limit memory per socket * * RFC 1122: * "the suggested [SWS] avoidance algorithm for the receiver is to keep * RECV.NEXT + RCV.WIN fixed until: * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" * * i.e. don't raise the right edge of the window until you can raise * it at least MSS bytes. * * Unfortunately, the recommended algorithm breaks header prediction, * since header prediction assumes th->window stays fixed. * * Strictly speaking, keeping th->window fixed violates the receiver * side SWS prevention criteria. The problem is that under this rule * a stream of single byte packets will cause the right side of the * window to always advance by a single byte. * * Of course, if the sender implements sender side SWS prevention * then this will not be a problem. * * BSD seems to make the following compromise: * * If the free space is less than the 1/4 of the maximum * space available and the free space is less than 1/2 mss, * then set the window to 0. * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ] * Otherwise, just prevent the window from shrinking * and from being larger than the largest representable value. * * This prevents incremental opening of the window in the regime * where TCP is limited by the speed of the reader side taking * data out of the TCP receive queue. It does nothing about * those cases where the window is constrained on the sender side * because the pipeline is full. * * BSD also seems to "accidentally" limit itself to windows that are a * multiple of MSS, at least until the free space gets quite small. * This would appear to be a side effect of the mbuf implementation. * Combining these two algorithms results in the observed behavior * of having a fixed window size at almost all times. * * Below we obtain similar behavior by forcing the offered window to * a multiple of the mss when it is feasible to do so. * * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. * Regular options like TIMESTAMP are taken into account. */ u32 __tcp_select_window(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); /* MSS for the peer's data. Previous versions used mss_clamp * here. I don't know if the value based on our guesses * of peer's MSS is better for the performance. It's more correct * but may be worse for the performance because of rcv_mss * fluctuations. --SAW 1998/11/1 */ int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); int allowed_space = tcp_full_space(sk); int full_space, window; if (sk_is_mptcp(sk)) mptcp_space(sk, &free_space, &allowed_space); full_space = min_t(int, tp->window_clamp, allowed_space); if (unlikely(mss > full_space)) { mss = full_space; if (mss <= 0) return 0; } /* Only allow window shrink if the sysctl is enabled and we have * a non-zero scaling factor in effect. */ if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale) goto shrink_window_allowed; /* do not allow window to shrink */ if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; if (tcp_under_memory_pressure(sk)) tcp_adjust_rcv_ssthresh(sk); /* free_space might become our new window, make sure we don't * increase it due to wscale. */ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); /* if free space is less than mss estimate, or is below 1/16th * of the maximum allowed, try to move to zero-window, else * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and * new incoming data is dropped due to memory limits. * With large window, mss test triggers way too late in order * to announce zero window in time before rmem limit kicks in. */ if (free_space < (allowed_space >> 4) || free_space < mss) return 0; } if (free_space > tp->rcv_ssthresh) free_space = tp->rcv_ssthresh; /* Don't do rounding if we are using window scaling, since the * scaled window will not line up with the MSS boundary anyway. */ if (tp->rx_opt.rcv_wscale) { window = free_space; /* Advertise enough space so that it won't get scaled away. * Import case: prevent zero window announcement if * 1<<rcv_wscale > mss. */ window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale)); } else { window = tp->rcv_wnd; /* Get the largest window that is a nice multiple of mss. * Window clamp already applied above. * If our current window offering is within 1 mss of the * free space we just keep it. This prevents the divide * and multiply from happening most of the time. * We also don't do any window rounding when the free space * is too small. */ if (window <= free_space - mss || window > free_space) window = rounddown(free_space, mss); else if (mss == full_space && free_space > window + (full_space >> 1)) window = free_space; } return window; shrink_window_allowed: /* new window should always be an exact multiple of scaling factor */ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; if (tcp_under_memory_pressure(sk)) tcp_adjust_rcv_ssthresh(sk); /* if free space is too low, return a zero window */ if (free_space < (allowed_space >> 4) || free_space < mss || free_space < (1 << tp->rx_opt.rcv_wscale)) return 0; } if (free_space > tp->rcv_ssthresh) { free_space = tp->rcv_ssthresh; /* new window should always be an exact multiple of scaling factor * * For this case, we ALIGN "up" (increase free_space) because * we know free_space is not zero here, it has been reduced from * the memory-based limit, and rcv_ssthresh is not a hard limit * (unlike sk_rcvbuf). */ free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale)); } return free_space; } void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb) { if (unlikely(tcp_has_tx_tstamp(next_skb))) { const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb); struct skb_shared_info *shinfo = skb_shinfo(skb); shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP; shinfo->tskey = next_shinfo->tskey; TCP_SKB_CB(skb)->txstamp_ack |= TCP_SKB_CB(next_skb)->txstamp_ack; } } /* Collapses two adjacent SKB's during retransmission. */ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = skb_rb_next(skb); int next_skb_size; next_skb_size = next_skb->len; BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size)) return false; tcp_highest_sack_replace(sk, next_skb, skb); /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; /* Merge over control information. This moves PSH/FIN etc. over */ TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; /* All done, get rid of second SKB and account for it so * packet counting does not break. */ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS; TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; /* changed transmit queue under us so clear hints */ tcp_clear_retrans_hints_partial(tp); if (next_skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = skb; tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb)); tcp_skb_collapse_tstamp(skb, next_skb); tcp_rtx_queue_unlink_and_free(next_skb, sk); return true; } /* Check if coalescing SKBs is legal. */ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) { if (tcp_skb_pcount(skb) > 1) return false; if (skb_cloned(skb)) return false; /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; return true; } /* Collapse packets in the retransmit queue to make to create * less packets on the wire. This is only done on retransmission. */ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, int space) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = to, *tmp; bool first = true; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)) return; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; skb_rbtree_walk_from_safe(skb, tmp) { if (!tcp_can_collapse(sk, skb)) break; if (!tcp_skb_can_collapse(to, skb)) break; space -= skb->len; if (first) { first = false; continue; } if (space < 0) break; if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) break; if (!tcp_collapse_retrans(sk, to)) break; } } /* This retransmits one SKB. Policy decisions and retransmit queue * state updates are done by the caller. Returns non-zero if an * error occurred which prevented the send. */ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int cur_mss; int diff, len, err; int avail_wnd; /* Inconclusive MTU probe */ if (icsk->icsk_mtup.probe_size) icsk->icsk_mtup.probe_size = 0; if (skb_still_in_host_queue(sk, skb)) return -EBUSY; start: if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; TCP_SKB_CB(skb)->seq++; goto start; } if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) { WARN_ON_ONCE(1); return -EINVAL; } if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return -ENOMEM; } if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ cur_mss = tcp_current_mss(sk); avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. The exception is the * case, when window is shrunk to zero. In this case * our retransmit of one segment serves as a zero window probe. */ if (avail_wnd <= 0) { if (TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; avail_wnd = cur_mss; } len = cur_mss * segs; if (len > avail_wnd) { len = rounddown(avail_wnd, cur_mss); if (!len) len = avail_wnd; } if (skb->len > len) { if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; diff = tcp_skb_pcount(skb); tcp_set_skb_tso_segs(skb, cur_mss); diff -= tcp_skb_pcount(skb); if (diff) tcp_adjust_pcount(sk, skb, diff); avail_wnd = min_t(int, avail_wnd, cur_mss); if (skb->len < avail_wnd) tcp_retrans_try_collapse(sk, skb, avail_wnd); } /* RFC3168, section 6.1.1.1. ECN fallback */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); /* Update global and local TCP statistics. */ segs = tcp_skb_pcount(skb); TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); tp->total_retrans += segs; tp->bytes_retrans += skb->len; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom * beyond what csum_start can cover. */ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; tcp_skb_tsorted_save(skb) { nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); if (nskb) { nskb->dev = NULL; err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC); } else { err = -ENOBUFS; } } tcp_skb_tsorted_restore(skb); if (!err) { tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns); tcp_rate_skb_sent(sk, skb); } } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } /* To avoid taking spuriously low RTT samples based on a timestamp * for a transmit that never happened, always mark EVER_RETRANS */ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, TCP_SKB_CB(skb)->seq, segs, err); if (likely(!err)) { trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); } return err; } int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) { struct tcp_sock *tp = tcp_sk(sk); int err = __tcp_retransmit_skb(sk, skb, segs); if (err == 0) { #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { net_dbg_ratelimited("retrans_out leaked\n"); } #endif TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); } /* Save stamp of the first (attempted) retransmit. */ if (!tp->retrans_stamp) tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb); if (tp->undo_retrans < 0) tp->undo_retrans = 0; tp->undo_retrans += tcp_skb_pcount(skb); return err; } /* This gets called after a retransmit timeout, and the initially * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either * we've sent it all or the congestion window limit is reached. */ void tcp_xmit_retransmit_queue(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb, *rtx_head, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); bool rearm_timer = false; u32 max_segs; int mib_idx; if (!tp->packets_out) return; rtx_head = tcp_rtx_queue_head(sk); skb = tp->retransmit_skb_hint ?: rtx_head; max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); skb_rbtree_walk_from(skb) { __u8 sacked; int segs; if (tcp_pacing_check(sk)) break; /* we could do better than to assign each time */ if (!hole) tp->retransmit_skb_hint = skb; segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp); if (segs <= 0) break; sacked = TCP_SKB_CB(skb)->sacked; /* In case tcp_shift_skb_data() have aggregated large skbs, * we need to make sure not sending too bigs TSO packets */ segs = min_t(int, segs, max_segs); if (tp->retrans_out >= tp->lost_out) { break; } else if (!(sacked & TCPCB_LOST)) { if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; continue; } else { if (icsk->icsk_ca_state != TCP_CA_Loss) mib_idx = LINUX_MIB_TCPFASTRETRANS; else mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; } if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; if (tcp_small_queue_check(sk, skb, 1)) break; if (tcp_retransmit_skb(sk, skb, segs)) break; NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) rearm_timer = true; } if (rearm_timer) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } /* We allow to exceed memory limits for FIN packets to expedite * connection tear down and (memory) recovery. * Otherwise tcp_send_fin() could be tempted to either delay FIN * or even be forced to close flow without any FIN. * In general, we want to allow one skb per socket to avoid hangs * with edge trigger epoll() */ void sk_forced_mem_schedule(struct sock *sk, int size) { int delta, amt; delta = size - sk->sk_forward_alloc; if (delta <= 0) return; amt = sk_mem_pages(delta); sk_forward_alloc_add(sk, amt << PAGE_SHIFT); sk_memory_allocated_add(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_charge_skmem(sk->sk_memcg, amt, gfp_memcg_charge() | __GFP_NOFAIL); } /* Send a FIN. The caller locks the socket for us. * We should try to send a FIN packet really hard, but eventually give up. */ void tcp_send_fin(struct sock *sk) { struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); /* Optimization, tack on the FIN if we have one skb in write queue and * this skb was not yet sent, or we are under memory pressure. * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ tskb = tail; if (!tskb && tcp_under_memory_pressure(sk)) tskb = skb_rb_last(&sk->tcp_rtx_queue); if (tskb) { TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; if (!tail) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have * if FIN had been sent. This is because retransmit path * does not change tp->snd_nxt. */ WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1); return; } } else { skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); if (unlikely(!skb)) return; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); } /* We get here when a process closes a file descriptor (either due to * an explicit close() or as a byproduct of exit()'ing) and there * was unread data in the receive queue. This behavior is recommended * by RFC 2525, section 2.17. -DaveM */ void tcp_send_active_reset(struct sock *sk, gfp_t priority) { struct sk_buff *skb; TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); /* NOTE: No TCP options attached and we never retransmit this. */ skb = alloc_skb(MAX_TCP_HEADER, priority); if (!skb) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); return; } /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); tcp_mstamp_refresh(tcp_sk(sk)); /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); /* skb of trace_tcp_send_reset() keeps the skb that caused RST, * skb here is different to the troublesome skb, so use NULL */ trace_tcp_send_reset(sk, NULL); } /* Send a crossed SYN-ACK during socket establishment. * WARNING: This routine must only be called when we have already sent * a SYN packet that crossed the incoming SYN that caused this routine * to get called. If this assumption fails then the initial rcv_wnd * and rcv_wscale values will not be correct. */ int tcp_send_synack(struct sock *sk) { struct sk_buff *skb; skb = tcp_rtx_queue_head(sk); if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { struct sk_buff *nskb; tcp_skb_tsorted_save(skb) { nskb = skb_copy(skb, GFP_ATOMIC); } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); tcp_highest_sack_replace(sk, skb, nskb); tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = nskb; } TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; tcp_ecn_send_synack(sk, skb); } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } /** * tcp_make_synack - Allocate one skb and build a SYNACK packet. * @sk: listener socket * @dst: dst entry attached to the SYNACK. It is consumed and caller * should not use it again. * @req: request_sock pointer * @foc: cookie for tcp fast open * @synack_type: Type of synack to prepare * @syn_skb: SYN packet just received. It could be NULL for rtx case. */ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); struct tcp_out_options opts; struct tcp_key key = {}; struct sk_buff *skb; int tcp_header_size; struct tcphdr *th; int mss; u64 now; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; } /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); switch (synack_type) { case TCP_SYNACK_NORMAL: skb_set_owner_w(skb, req_to_sk(req)); break; case TCP_SYNACK_COOKIE: /* Under synflood, we do not attach skb to a socket, * to avoid false sharing. */ break; case TCP_SYNACK_FASTOPEN: /* sk is a const pointer, because we want to express multiple * cpu might call us concurrently. * sk->sk_wmem_alloc in an atomic, we can promote to rw. */ skb_set_owner_w(skb, (struct sock *)sk); break; } skb_dst_set(skb, dst); mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) skb_set_delivery_time(skb, cookie_init_timestamp(req, now), true); else #endif { skb_set_delivery_time(skb, now, true); if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_lock(); #endif if (tcp_rsk_used_ao(req)) { #ifdef CONFIG_TCP_AO struct tcp_ao_key *ao_key = NULL; u8 keyid = tcp_rsk(req)->ao_keyid; ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req), keyid, -1); /* If there is no matching key - avoid sending anything, * especially usigned segments. It could try harder and lookup * for another peer-matching key, but the peer has requested * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here. */ if (unlikely(!ao_key)) { rcu_read_unlock(); kfree_skb(skb); net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n", keyid); return NULL; } key.ao_key = ao_key; key.type = TCP_KEY_AO; #endif } else { #ifdef CONFIG_TCP_MD5SIG key.md5_key = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4); /* bpf program will be interested in the tcp_flags */ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK; tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &key, foc, synack_type, syn_skb) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); th = (struct tcphdr *)skb->data; memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; tcp_ecn_make_synack(req, th); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; skb->mark = ireq->ir_mark; skb->ip_summed = CHECKSUM_PARTIAL; th->seq = htonl(tcp_rsk(req)->snt_isn); /* XXX data is queued and acked as is. No buffer/window check */ th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rsk_rcv_wnd, 65535U)); tcp_options_write(th, NULL, tcp_rsk(req), &opts, &key); th->doff = (tcp_header_size >> 2); TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); /* Okay, we have all we need - do the md5 hash if needed */ if (tcp_key_is_md5(&key)) { #ifdef CONFIG_TCP_MD5SIG tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, key.md5_key, req_to_sk(req), skb); #endif } else if (tcp_key_is_ao(&key)) { #ifdef CONFIG_TCP_AO tcp_rsk(req)->af_specific->ao_synack_hash(opts.hash_location, key.ao_key, req, skb, opts.hash_location - (u8 *)th, 0); #endif } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_unlock(); #endif bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, synack_type, &opts); skb_set_delivery_time(skb, now, true); tcp_add_tx_delay(skb, tp); return skb; } EXPORT_SYMBOL(tcp_make_synack); static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); if (ca_key == TCP_CA_UNSPEC) return; rcu_read_lock(); ca = tcp_ca_find_key(ca_key); if (likely(ca && bpf_try_module_get(ca, ca->owner))) { bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; } rcu_read_unlock(); } /* Do all connect socket setups that can be done AF independent. */ static void tcp_connect_init(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; u32 rcv_wnd; /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. */ tp->tcp_header_len = sizeof(struct tcphdr); if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps)) tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; tcp_ao_connect_init(sk); /* If user gave his TCP_MAXSEG, record it to clamp */ if (tp->rx_opt.user_mss) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; tp->max_window = 0; tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); tcp_ca_dst_init(sk, dst); if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); tcp_initialize_rcv_mss(sk); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) tp->window_clamp = tcp_full_space(sk); rcv_wnd = tcp_rwnd_init_bpf(sk); if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); tcp_select_initial_window(sk, tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling), &rcv_wscale, rcv_wnd); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; WRITE_ONCE(sk->sk_err, 0); sock_reset_flag(sk, SOCK_DONE); tp->snd_wnd = 0; tcp_init_wl(tp, 0); tcp_write_queue_purge(sk); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; WRITE_ONCE(tp->snd_nxt, tp->write_seq); if (likely(!tp->repair)) tp->rcv_nxt = 0; else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->end_seq += skb->len; __skb_header_release(skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); WRITE_ONCE(tp->write_seq, tcb->end_seq); tp->packets_out += tcp_skb_pcount(skb); } /* Build and send a SYN with data and (cached) Fast Open cookie. However, * queue a data-only packet after the regular SYN, such that regular SYNs * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges * only the SYN sequence, the data are retransmitted in the first ACK. * If cookie is not cached or other error occurs, falls back to send a * regular SYN with Fast Open cookie request option. */ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; struct page_frag *pfrag = sk_page_frag(sk); struct sk_buff *syn_data; int space, err = 0; tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie)) goto fallback; /* MSS for SYN-data is based on cached MSS and bounded by PMTU and * user-MSS. Reserve maximum option space for middleboxes that add * private TCP options. The cost is reduced data space in SYN :( */ tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp); /* Sync mss_cache after updating the mss_clamp */ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; space = min_t(size_t, space, fo->size); if (space && !skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE), pfrag, sk->sk_allocation)) goto fallback; syn_data = tcp_stream_alloc_skb(sk, sk->sk_allocation, false); if (!syn_data) goto fallback; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); if (space) { space = min_t(size_t, space, pfrag->size - pfrag->offset); space = tcp_wmem_schedule(sk, space); } if (space) { space = copy_page_from_iter(pfrag->page, pfrag->offset, space, &fo->data->msg_iter); if (unlikely(!space)) { tcp_skb_tsorted_anchor_cleanup(syn_data); kfree_skb(syn_data); goto fallback; } skb_fill_page_desc(syn_data, 0, pfrag->page, pfrag->offset, space); page_ref_inc(pfrag->page); pfrag->offset += space; skb_len_add(syn_data, space); skb_zcopy_set(syn_data, fo->uarg, NULL); } /* No more data pending in inet_wait_for_connect() */ if (space == fo->size) fo->data = NULL; fo->copied = space; tcp_connect_queue_skb(sk, syn_data); if (syn_data->len) tcp_chrono_start(sk, TCP_CHRONO_BUSY); err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true); /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) * we keep in write queue in case of a retransmit, as we * also have the SYN packet (with no data) in the same queue. */ TCP_SKB_CB(syn_data)->seq++; TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; if (!err) { tp->syn_data = (fo->copied > 0); tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } /* data was not sent, put it in write_queue */ __skb_queue_tail(&sk->sk_write_queue, syn_data); tp->packets_out -= tcp_skb_pcount(syn_data); fallback: /* Send a regular SYN with Fast Open cookie request option */ if (fo->cookie.len > 0) fo->cookie.len = 0; err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); if (err) tp->syn_fastopen = 0; done: fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ return err; } /* Build a SYN and send it off. */ int tcp_connect(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int err; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL); #if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO) /* Has to be checked late, after setting daddr/saddr/ops. * Return error if the peer has both a md5 and a tcp-ao key * configured as this is ambiguous. */ if (unlikely(rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)))) { bool needs_ao = !!tp->af_specific->ao_lookup(sk, sk, -1, -1); bool needs_md5 = !!tp->af_specific->md5_lookup(sk, sk); struct tcp_ao_info *ao_info; ao_info = rcu_dereference_check(tp->ao_info, lockdep_sock_is_held(sk)); if (ao_info) { /* This is an extra check: tcp_ao_required() in * tcp_v{4,6}_parse_md5_keys() should prevent adding * md5 keys on ao_required socket. */ needs_ao |= ao_info->ao_required; WARN_ON_ONCE(ao_info->ao_required && needs_md5); } if (needs_md5 && needs_ao) return -EKEYREJECTED; /* If we have a matching md5 key and no matching tcp-ao key * then free up ao_info if allocated. */ if (needs_md5) { tcp_ao_destroy_sock(sk, false); } else if (needs_ao) { tcp_clear_md5_list(sk); kfree(rcu_replace_pointer(tp->md5sig_info, NULL, lockdep_sock_is_held(sk))); } } #endif #ifdef CONFIG_TCP_AO if (unlikely(rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held(sk)))) { /* Don't allow connecting if ao is configured but no * matching key is found. */ if (!tp->af_specific->ao_lookup(sk, sk, -1, -1)) return -EKEYREJECTED; } #endif if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ tcp_connect_init(sk); if (unlikely(tp->repair)) { tcp_finish_connect(sk, NULL); return 0; } buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true); if (unlikely(!buff)) return -ENOBUFS; tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tcp_mstamp_refresh(tp); tp->retrans_stamp = tcp_time_stamp_ts(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); if (err == -ECONNREFUSED) return err; /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. */ WRITE_ONCE(tp->snd_nxt, tp->write_seq); tp->pushed_seq = tp->write_seq; buff = tcp_send_head(sk); if (unlikely(buff)) { WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq); tp->pushed_seq = TCP_SKB_CB(buff)->seq; } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 0; } EXPORT_SYMBOL(tcp_connect); u32 tcp_delack_max(const struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); u32 delack_max = inet_csk(sk)->icsk_delack_max; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) { u32 rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); u32 delack_from_rto_min = max_t(int, 1, rto_min - 1); delack_max = min_t(u32, delack_max, delack_from_rto_min); } return delack_max; } /* Send out a delayed ack, the caller does the policy checking * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() * for details. */ void tcp_send_delayed_ack(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int ato = icsk->icsk_ack.ato; unsigned long timeout; if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; if (inet_csk_in_pingpong_mode(sk) || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; /* Slow path, intersegment interval is "high". */ /* If some rtt estimate is known, use it to bound delayed ack. * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * directly. */ if (tp->srtt_us) { int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), TCP_DELACK_MIN); if (rtt < max_ato) max_ato = rtt; } ato = min(ato, max_ato); } ato = min_t(u32, ato, tcp_delack_max(sk)); /* Stay within the limit we were given */ timeout = jiffies + ato; /* Use new timeout only if there wasn't a older one earlier. */ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { /* If delack timer is about to expire, send ACK now. */ if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { tcp_send_ack(sk); return; } if (!time_before(timeout, icsk->icsk_ack.timeout)) timeout = icsk->icsk_ack.timeout; } icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; icsk->icsk_ack.timeout = timeout; sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } /* This routine sends an ack and also updates the window. */ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) { struct sk_buff *buff; /* If we have been reset, we may not send again. */ if (sk->sk_state == TCP_CLOSE) return; /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. */ buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (unlikely(!buff)) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned long delay; delay = TCP_DELACK_MAX << icsk->icsk_ack.retry; if (delay < TCP_RTO_MAX) icsk->icsk_ack.retry++; inet_csk_schedule_ack(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX); return; } /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784 */ skb_set_tcp_pure_ack(buff); /* Send it off, this clears delayed acks for us. */ __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt); } EXPORT_SYMBOL_GPL(__tcp_send_ack); void tcp_send_ack(struct sock *sk) { __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); } /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. * * Question: what should we make while urgent mode? * 4.4BSD forces sending single byte of data. We cannot send * out of window data, because we have SND.NXT==SND.MAX... * * Current solution: to send TWO zero-length segments in urgent mode: * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is * out-of-date with SND.UNA-1 to probe window. */ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; /* We don't queue it, tcp_transmit_skb() sets ownership. */ skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (!skb) return -1; /* Reserve space for headers and set control bits. */ skb_reserve(skb, MAX_TCP_HEADER); /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just * send it. */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); } /* Called from setsockopt( ... TCP_REPAIR ) */ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; tcp_mstamp_refresh(tcp_sk(sk)); tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); } } /* Initiate keepalive or window probe from timer. */ int tcp_write_wakeup(struct sock *sk, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; if (sk->sk_state == TCP_CLOSE) return -1; skb = tcp_send_head(sk); if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { int err; unsigned int mss = tcp_current_mss(sk); unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; /* We are probing the opening of a window * but the window size is != 0 * must have been a result SWS avoidance ( sender ) */ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) tcp_event_new_data_sent(sk, skb); return err; } else { if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) tcp_xmit_probe_skb(sk, 1, mib); return tcp_xmit_probe_skb(sk, 0, mib); } } /* A window probe timeout has occurred. If window is not closed send * a partial packet else a zero probe. */ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); unsigned long timeout; int err; err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. */ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; icsk->icsk_probes_tstamp = 0; return; } icsk->icsk_probes_out++; if (err <= 0) { if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; timeout = tcp_probe0_when(sk, TCP_RTO_MAX); } else { /* If packet was not sent due to local congestion, * Let senders fight for local resources conservatively. */ timeout = TCP_RESOURCE_PROBE_INTERVAL; } timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout); tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) { const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; struct flowi fl; int res; /* Paired with WRITE_ONCE() in sock_setsockopt() */ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash()); res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, NULL); if (!res) { TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); if (unlikely(tcp_passive_fastopen(sk))) { /* sk has const attribute because listeners are lockless. * However in this case, we are dealing with a passive fastopen * socket thus we can change total_retrans value. */ tcp_sk_rw(sk)->total_retrans++; } trace_tcp_retransmit_synack(sk, req); } return res; } EXPORT_SYMBOL(tcp_rtx_synack); |
4 1 3 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2013 Eric Leblond <eric@regit.org> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_reject.h> #include <linux/icmp.h> #include <linux/icmpv6.h> const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { [NFTA_REJECT_TYPE] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, }; EXPORT_SYMBOL_GPL(nft_reject_policy); int nft_reject_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_PRE_ROUTING)); } EXPORT_SYMBOL_GPL(nft_reject_validate); int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); int icmp_code; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: case NFT_REJECT_ICMPX_UNREACH: if (tb[NFTA_REJECT_ICMP_CODE] == NULL) return -EINVAL; icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); if (priv->type == NFT_REJECT_ICMPX_UNREACH && icmp_code > NFT_REJECT_ICMPX_MAX) return -EINVAL; priv->icmp_code = icmp_code; break; case NFT_REJECT_TCP_RST: break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL_GPL(nft_reject_init); int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_reject *priv = nft_expr_priv(expr); if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) goto nla_put_failure; switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: case NFT_REJECT_ICMPX_UNREACH: if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; default: break; } return 0; nla_put_failure: return -1; } EXPORT_SYMBOL_GPL(nft_reject_dump); static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX + 1] = { [NFT_REJECT_ICMPX_NO_ROUTE] = ICMP_NET_UNREACH, [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMP_PORT_UNREACH, [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMP_HOST_UNREACH, [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMP_PKT_FILTERED, }; int nft_reject_icmp_code(u8 code) { if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX)) return ICMP_NET_UNREACH; return icmp_code_v4[code]; } EXPORT_SYMBOL_GPL(nft_reject_icmp_code); static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX + 1] = { [NFT_REJECT_ICMPX_NO_ROUTE] = ICMPV6_NOROUTE, [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMPV6_PORT_UNREACH, [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMPV6_ADDR_UNREACH, [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMPV6_ADM_PROHIBITED, }; int nft_reject_icmpv6_code(u8 code) { if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX)) return ICMPV6_NOROUTE; return icmp_code_v6[code]; } EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Netfilter x_tables over nftables module"); |
48 48 2 2 2 1 1 1 2 2 1 2 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 | // SPDX-License-Identifier: GPL-2.0-only /* * scsi_error.c Copyright (C) 1997 Eric Youngdale * * SCSI error/timeout handling * Initial versions: Eric Youngdale. Based upon conversations with * Leonard Zubkoff and David Miller at Linux Expo, * ideas originating from all over the place. * * Restructured scsi_unjam_host and associated functions. * September 04, 2002 Mike Anderson (andmike@us.ibm.com) * * Forward port of Russell King's (rmk@arm.linux.org.uk) changes and * minor cleanups. * September 30, 2002 Mike Anderson (andmike@us.ibm.com) */ #include <linux/module.h> #include <linux/sched.h> #include <linux/gfp.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/interrupt.h> #include <linux/blkdev.h> #include <linux/delay.h> #include <linux/jiffies.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_common.h> #include <scsi/scsi_transport.h> #include <scsi/scsi_host.h> #include <scsi/scsi_ioctl.h> #include <scsi/scsi_dh.h> #include <scsi/scsi_devinfo.h> #include <scsi/sg.h> #include "scsi_priv.h" #include "scsi_logging.h" #include "scsi_transport_api.h" #include <trace/events/scsi.h> #include <asm/unaligned.h> /* * These should *probably* be handled by the host itself. * Since it is allowed to sleep, it probably should. */ #define BUS_RESET_SETTLE_TIME (10) #define HOST_RESET_SETTLE_TIME (10) static int scsi_eh_try_stu(struct scsi_cmnd *scmd); static enum scsi_disposition scsi_try_to_abort_cmd(const struct scsi_host_template *, struct scsi_cmnd *); void scsi_eh_wakeup(struct Scsi_Host *shost, unsigned int busy) { lockdep_assert_held(shost->host_lock); if (busy == shost->host_failed) { trace_scsi_eh_wakeup(shost); wake_up_process(shost->ehandler); SCSI_LOG_ERROR_RECOVERY(5, shost_printk(KERN_INFO, shost, "Waking error handler thread\n")); } } /** * scsi_schedule_eh - schedule EH for SCSI host * @shost: SCSI host to invoke error handling on. * * Schedule SCSI EH without scmd. */ void scsi_schedule_eh(struct Scsi_Host *shost) { unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 || scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) { shost->host_eh_scheduled++; scsi_eh_wakeup(shost, scsi_host_busy(shost)); } spin_unlock_irqrestore(shost->host_lock, flags); } EXPORT_SYMBOL_GPL(scsi_schedule_eh); static int scsi_host_eh_past_deadline(struct Scsi_Host *shost) { if (!shost->last_reset || shost->eh_deadline == -1) return 0; /* * 32bit accesses are guaranteed to be atomic * (on all supported architectures), so instead * of using a spinlock we can as well double check * if eh_deadline has been set to 'off' during the * time_before call. */ if (time_before(jiffies, shost->last_reset + shost->eh_deadline) && shost->eh_deadline > -1) return 0; return 1; } static bool scsi_cmd_retry_allowed(struct scsi_cmnd *cmd) { if (cmd->allowed == SCSI_CMD_RETRIES_NO_LIMIT) return true; return ++cmd->retries <= cmd->allowed; } static bool scsi_eh_should_retry_cmd(struct scsi_cmnd *cmd) { struct scsi_device *sdev = cmd->device; struct Scsi_Host *host = sdev->host; if (host->hostt->eh_should_retry_cmd) return host->hostt->eh_should_retry_cmd(cmd); return true; } /** * scmd_eh_abort_handler - Handle command aborts * @work: command to be aborted. * * Note: this function must be called only for a command that has timed out. * Because the block layer marks a request as complete before it calls * scsi_timeout(), a .scsi_done() call from the LLD for a command that has * timed out do not have any effect. Hence it is safe to call * scsi_finish_command() from this function. */ void scmd_eh_abort_handler(struct work_struct *work) { struct scsi_cmnd *scmd = container_of(work, struct scsi_cmnd, abort_work.work); struct scsi_device *sdev = scmd->device; struct Scsi_Host *shost = sdev->host; enum scsi_disposition rtn; unsigned long flags; if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "eh timeout, not aborting\n")); goto out; } SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "aborting command\n")); rtn = scsi_try_to_abort_cmd(shost->hostt, scmd); if (rtn != SUCCESS) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "cmd abort %s\n", (rtn == FAST_IO_FAIL) ? "not send" : "failed")); goto out; } set_host_byte(scmd, DID_TIME_OUT); if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "eh timeout, not retrying " "aborted command\n")); goto out; } spin_lock_irqsave(shost->host_lock, flags); list_del_init(&scmd->eh_entry); /* * If the abort succeeds, and there is no further * EH action, clear the ->last_reset time. */ if (list_empty(&shost->eh_abort_list) && list_empty(&shost->eh_cmd_q)) if (shost->eh_deadline != -1) shost->last_reset = 0; spin_unlock_irqrestore(shost->host_lock, flags); if (!scsi_noretry_cmd(scmd) && scsi_cmd_retry_allowed(scmd) && scsi_eh_should_retry_cmd(scmd)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_WARNING, scmd, "retry aborted command\n")); scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY); } else { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_WARNING, scmd, "finish aborted command\n")); scsi_finish_command(scmd); } return; out: spin_lock_irqsave(shost->host_lock, flags); list_del_init(&scmd->eh_entry); spin_unlock_irqrestore(shost->host_lock, flags); scsi_eh_scmd_add(scmd); } /** * scsi_abort_command - schedule a command abort * @scmd: scmd to abort. * * We only need to abort commands after a command timeout */ static int scsi_abort_command(struct scsi_cmnd *scmd) { struct scsi_device *sdev = scmd->device; struct Scsi_Host *shost = sdev->host; unsigned long flags; if (!shost->hostt->eh_abort_handler) { /* No abort handler, fail command directly */ return FAILED; } if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) { /* * Retry after abort failed, escalate to next level. */ SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "previous abort failed\n")); BUG_ON(delayed_work_pending(&scmd->abort_work)); return FAILED; } spin_lock_irqsave(shost->host_lock, flags); if (shost->eh_deadline != -1 && !shost->last_reset) shost->last_reset = jiffies; BUG_ON(!list_empty(&scmd->eh_entry)); list_add_tail(&scmd->eh_entry, &shost->eh_abort_list); spin_unlock_irqrestore(shost->host_lock, flags); scmd->eh_eflags |= SCSI_EH_ABORT_SCHEDULED; SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "abort scheduled\n")); queue_delayed_work(shost->tmf_work_q, &scmd->abort_work, HZ / 100); return SUCCESS; } /** * scsi_eh_reset - call into ->eh_action to reset internal counters * @scmd: scmd to run eh on. * * The scsi driver might be carrying internal state about the * devices, so we need to call into the driver to reset the * internal state once the error handler is started. */ static void scsi_eh_reset(struct scsi_cmnd *scmd) { if (!blk_rq_is_passthrough(scsi_cmd_to_rq(scmd))) { struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); if (sdrv->eh_reset) sdrv->eh_reset(scmd); } } static void scsi_eh_inc_host_failed(struct rcu_head *head) { struct scsi_cmnd *scmd = container_of(head, typeof(*scmd), rcu); struct Scsi_Host *shost = scmd->device->host; unsigned int busy = scsi_host_busy(shost); unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); shost->host_failed++; scsi_eh_wakeup(shost, busy); spin_unlock_irqrestore(shost->host_lock, flags); } /** * scsi_eh_scmd_add - add scsi cmd to error handling. * @scmd: scmd to run eh on. */ void scsi_eh_scmd_add(struct scsi_cmnd *scmd) { struct Scsi_Host *shost = scmd->device->host; unsigned long flags; int ret; WARN_ON_ONCE(!shost->ehandler); WARN_ON_ONCE(!test_bit(SCMD_STATE_INFLIGHT, &scmd->state)); spin_lock_irqsave(shost->host_lock, flags); if (scsi_host_set_state(shost, SHOST_RECOVERY)) { ret = scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY); WARN_ON_ONCE(ret); } if (shost->eh_deadline != -1 && !shost->last_reset) shost->last_reset = jiffies; scsi_eh_reset(scmd); list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q); spin_unlock_irqrestore(shost->host_lock, flags); /* * Ensure that all tasks observe the host state change before the * host_failed change. */ call_rcu_hurry(&scmd->rcu, scsi_eh_inc_host_failed); } /** * scsi_timeout - Timeout function for normal scsi commands. * @req: request that is timing out. * * Notes: * We do not need to lock this. There is the potential for a race * only in that the normal completion handling might run, but if the * normal completion function determines that the timer has already * fired, then it mustn't do anything. */ enum blk_eh_timer_return scsi_timeout(struct request *req) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); struct Scsi_Host *host = scmd->device->host; trace_scsi_dispatch_cmd_timeout(scmd); scsi_log_completion(scmd, TIMEOUT_ERROR); atomic_inc(&scmd->device->iotmo_cnt); if (host->eh_deadline != -1 && !host->last_reset) host->last_reset = jiffies; if (host->hostt->eh_timed_out) { switch (host->hostt->eh_timed_out(scmd)) { case SCSI_EH_DONE: return BLK_EH_DONE; case SCSI_EH_RESET_TIMER: return BLK_EH_RESET_TIMER; case SCSI_EH_NOT_HANDLED: break; } } /* * If scsi_done() has already set SCMD_STATE_COMPLETE, do not modify * *scmd. */ if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->state)) return BLK_EH_DONE; atomic_inc(&scmd->device->iodone_cnt); if (scsi_abort_command(scmd) != SUCCESS) { set_host_byte(scmd, DID_TIME_OUT); scsi_eh_scmd_add(scmd); } return BLK_EH_DONE; } /** * scsi_block_when_processing_errors - Prevent cmds from being queued. * @sdev: Device on which we are performing recovery. * * Description: * We block until the host is out of error recovery, and then check to * see whether the host or the device is offline. * * Return value: * 0 when dev was taken offline by error recovery. 1 OK to proceed. */ int scsi_block_when_processing_errors(struct scsi_device *sdev) { int online; wait_event(sdev->host->host_wait, !scsi_host_in_recovery(sdev->host)); online = scsi_device_online(sdev); return online; } EXPORT_SYMBOL(scsi_block_when_processing_errors); #ifdef CONFIG_SCSI_LOGGING /** * scsi_eh_prt_fail_stats - Log info on failures. * @shost: scsi host being recovered. * @work_q: Queue of scsi cmds to process. */ static inline void scsi_eh_prt_fail_stats(struct Scsi_Host *shost, struct list_head *work_q) { struct scsi_cmnd *scmd; struct scsi_device *sdev; int total_failures = 0; int cmd_failed = 0; int cmd_cancel = 0; int devices_failed = 0; shost_for_each_device(sdev, shost) { list_for_each_entry(scmd, work_q, eh_entry) { if (scmd->device == sdev) { ++total_failures; if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) ++cmd_cancel; else ++cmd_failed; } } if (cmd_cancel || cmd_failed) { SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: cmds failed: %d, cancel: %d\n", __func__, cmd_failed, cmd_cancel)); cmd_cancel = 0; cmd_failed = 0; ++devices_failed; } } SCSI_LOG_ERROR_RECOVERY(2, shost_printk(KERN_INFO, shost, "Total of %d commands on %d" " devices require eh work\n", total_failures, devices_failed)); } #endif /** * scsi_report_lun_change - Set flag on all *other* devices on the same target * to indicate that a UNIT ATTENTION is expected. * @sdev: Device reporting the UNIT ATTENTION */ static void scsi_report_lun_change(struct scsi_device *sdev) { sdev->sdev_target->expecting_lun_change = 1; } /** * scsi_report_sense - Examine scsi sense information and log messages for * certain conditions, also issue uevents for some of them. * @sdev: Device reporting the sense code * @sshdr: sshdr to be examined */ static void scsi_report_sense(struct scsi_device *sdev, struct scsi_sense_hdr *sshdr) { enum scsi_device_event evt_type = SDEV_EVT_MAXBITS; /* i.e. none */ if (sshdr->sense_key == UNIT_ATTENTION) { if (sshdr->asc == 0x3f && sshdr->ascq == 0x03) { evt_type = SDEV_EVT_INQUIRY_CHANGE_REPORTED; sdev_printk(KERN_WARNING, sdev, "Inquiry data has changed"); } else if (sshdr->asc == 0x3f && sshdr->ascq == 0x0e) { evt_type = SDEV_EVT_LUN_CHANGE_REPORTED; scsi_report_lun_change(sdev); sdev_printk(KERN_WARNING, sdev, "LUN assignments on this target have " "changed. The Linux SCSI layer does not " "automatically remap LUN assignments.\n"); } else if (sshdr->asc == 0x3f) sdev_printk(KERN_WARNING, sdev, "Operating parameters on this target have " "changed. The Linux SCSI layer does not " "automatically adjust these parameters.\n"); if (sshdr->asc == 0x38 && sshdr->ascq == 0x07) { evt_type = SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED; sdev_printk(KERN_WARNING, sdev, "Warning! Received an indication that the " "LUN reached a thin provisioning soft " "threshold.\n"); } if (sshdr->asc == 0x29) { evt_type = SDEV_EVT_POWER_ON_RESET_OCCURRED; /* * Do not print message if it is an expected side-effect * of runtime PM. */ if (!sdev->silence_suspend) sdev_printk(KERN_WARNING, sdev, "Power-on or device reset occurred\n"); } if (sshdr->asc == 0x2a && sshdr->ascq == 0x01) { evt_type = SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED; sdev_printk(KERN_WARNING, sdev, "Mode parameters changed"); } else if (sshdr->asc == 0x2a && sshdr->ascq == 0x06) { evt_type = SDEV_EVT_ALUA_STATE_CHANGE_REPORTED; sdev_printk(KERN_WARNING, sdev, "Asymmetric access state changed"); } else if (sshdr->asc == 0x2a && sshdr->ascq == 0x09) { evt_type = SDEV_EVT_CAPACITY_CHANGE_REPORTED; sdev_printk(KERN_WARNING, sdev, "Capacity data has changed"); } else if (sshdr->asc == 0x2a) sdev_printk(KERN_WARNING, sdev, "Parameters changed"); } if (evt_type != SDEV_EVT_MAXBITS) { set_bit(evt_type, sdev->pending_events); schedule_work(&sdev->event_work); } } static inline void set_scsi_ml_byte(struct scsi_cmnd *cmd, u8 status) { cmd->result = (cmd->result & 0xffff00ff) | (status << 8); } /** * scsi_check_sense - Examine scsi cmd sense * @scmd: Cmd to have sense checked. * * Return value: * SUCCESS or FAILED or NEEDS_RETRY or ADD_TO_MLQUEUE * * Notes: * When a deferred error is detected the current command has * not been executed and needs retrying. */ enum scsi_disposition scsi_check_sense(struct scsi_cmnd *scmd) { struct request *req = scsi_cmd_to_rq(scmd); struct scsi_device *sdev = scmd->device; struct scsi_sense_hdr sshdr; if (! scsi_command_normalize_sense(scmd, &sshdr)) return FAILED; /* no valid sense data */ scsi_report_sense(sdev, &sshdr); if (scsi_sense_is_deferred(&sshdr)) return NEEDS_RETRY; if (sdev->handler && sdev->handler->check_sense) { enum scsi_disposition rc; rc = sdev->handler->check_sense(sdev, &sshdr); if (rc != SCSI_RETURN_NOT_HANDLED) return rc; /* handler does not care. Drop down to default handling */ } if (scmd->cmnd[0] == TEST_UNIT_READY && scmd->submitter != SUBMITTED_BY_SCSI_ERROR_HANDLER) /* * nasty: for mid-layer issued TURs, we need to return the * actual sense data without any recovery attempt. For eh * issued ones, we need to try to recover and interpret */ return SUCCESS; /* * Previous logic looked for FILEMARK, EOM or ILI which are * mainly associated with tapes and returned SUCCESS. */ if (sshdr.response_code == 0x70) { /* fixed format */ if (scmd->sense_buffer[2] & 0xe0) return SUCCESS; } else { /* * descriptor format: look for "stream commands sense data * descriptor" (see SSC-3). Assume single sense data * descriptor. Ignore ILI from SBC-2 READ LONG and WRITE LONG. */ if ((sshdr.additional_length > 3) && (scmd->sense_buffer[8] == 0x4) && (scmd->sense_buffer[11] & 0xe0)) return SUCCESS; } switch (sshdr.sense_key) { case NO_SENSE: return SUCCESS; case RECOVERED_ERROR: return /* soft_error */ SUCCESS; case ABORTED_COMMAND: if (sshdr.asc == 0x10) /* DIF */ return SUCCESS; /* * Check aborts due to command duration limit policy: * ABORTED COMMAND additional sense code with the * COMMAND TIMEOUT BEFORE PROCESSING or * COMMAND TIMEOUT DURING PROCESSING or * COMMAND TIMEOUT DURING PROCESSING DUE TO ERROR RECOVERY * additional sense code qualifiers. */ if (sshdr.asc == 0x2e && sshdr.ascq >= 0x01 && sshdr.ascq <= 0x03) { set_scsi_ml_byte(scmd, SCSIML_STAT_DL_TIMEOUT); req->cmd_flags |= REQ_FAILFAST_DEV; req->rq_flags |= RQF_QUIET; return SUCCESS; } if (sshdr.asc == 0x44 && sdev->sdev_bflags & BLIST_RETRY_ITF) return ADD_TO_MLQUEUE; if (sshdr.asc == 0xc1 && sshdr.ascq == 0x01 && sdev->sdev_bflags & BLIST_RETRY_ASC_C1) return ADD_TO_MLQUEUE; return NEEDS_RETRY; case NOT_READY: case UNIT_ATTENTION: /* * if we are expecting a cc/ua because of a bus reset that we * performed, treat this just as a retry. otherwise this is * information that we should pass up to the upper-level driver * so that we can deal with it there. */ if (scmd->device->expecting_cc_ua) { /* * Because some device does not queue unit * attentions correctly, we carefully check * additional sense code and qualifier so as * not to squash media change unit attention. */ if (sshdr.asc != 0x28 || sshdr.ascq != 0x00) { scmd->device->expecting_cc_ua = 0; return NEEDS_RETRY; } } /* * we might also expect a cc/ua if another LUN on the target * reported a UA with an ASC/ASCQ of 3F 0E - * REPORTED LUNS DATA HAS CHANGED. */ if (scmd->device->sdev_target->expecting_lun_change && sshdr.asc == 0x3f && sshdr.ascq == 0x0e) return NEEDS_RETRY; /* * if the device is in the process of becoming ready, we * should retry. */ if ((sshdr.asc == 0x04) && (sshdr.ascq == 0x01)) return NEEDS_RETRY; /* * if the device is not started, we need to wake * the error handler to start the motor */ if (scmd->device->allow_restart && (sshdr.asc == 0x04) && (sshdr.ascq == 0x02)) return FAILED; /* * Pass the UA upwards for a determination in the completion * functions. */ return SUCCESS; /* these are not supported */ case DATA_PROTECT: if (sshdr.asc == 0x27 && sshdr.ascq == 0x07) { /* Thin provisioning hard threshold reached */ set_scsi_ml_byte(scmd, SCSIML_STAT_NOSPC); return SUCCESS; } fallthrough; case COPY_ABORTED: case VOLUME_OVERFLOW: case MISCOMPARE: case BLANK_CHECK: set_scsi_ml_byte(scmd, SCSIML_STAT_TGT_FAILURE); return SUCCESS; case MEDIUM_ERROR: if (sshdr.asc == 0x11 || /* UNRECOVERED READ ERR */ sshdr.asc == 0x13 || /* AMNF DATA FIELD */ sshdr.asc == 0x14) { /* RECORD NOT FOUND */ set_scsi_ml_byte(scmd, SCSIML_STAT_MED_ERROR); return SUCCESS; } return NEEDS_RETRY; case HARDWARE_ERROR: if (scmd->device->retry_hwerror) return ADD_TO_MLQUEUE; else set_scsi_ml_byte(scmd, SCSIML_STAT_TGT_FAILURE); fallthrough; case ILLEGAL_REQUEST: if (sshdr.asc == 0x20 || /* Invalid command operation code */ sshdr.asc == 0x21 || /* Logical block address out of range */ sshdr.asc == 0x22 || /* Invalid function */ sshdr.asc == 0x24 || /* Invalid field in cdb */ sshdr.asc == 0x26 || /* Parameter value invalid */ sshdr.asc == 0x27) { /* Write protected */ set_scsi_ml_byte(scmd, SCSIML_STAT_TGT_FAILURE); } return SUCCESS; case COMPLETED: if (sshdr.asc == 0x55 && sshdr.ascq == 0x0a) { set_scsi_ml_byte(scmd, SCSIML_STAT_DL_TIMEOUT); req->cmd_flags |= REQ_FAILFAST_DEV; req->rq_flags |= RQF_QUIET; } return SUCCESS; default: return SUCCESS; } } EXPORT_SYMBOL_GPL(scsi_check_sense); static void scsi_handle_queue_ramp_up(struct scsi_device *sdev) { const struct scsi_host_template *sht = sdev->host->hostt; struct scsi_device *tmp_sdev; if (!sht->track_queue_depth || sdev->queue_depth >= sdev->max_queue_depth) return; if (time_before(jiffies, sdev->last_queue_ramp_up + sdev->queue_ramp_up_period)) return; if (time_before(jiffies, sdev->last_queue_full_time + sdev->queue_ramp_up_period)) return; /* * Walk all devices of a target and do * ramp up on them. */ shost_for_each_device(tmp_sdev, sdev->host) { if (tmp_sdev->channel != sdev->channel || tmp_sdev->id != sdev->id || tmp_sdev->queue_depth == sdev->max_queue_depth) continue; scsi_change_queue_depth(tmp_sdev, tmp_sdev->queue_depth + 1); sdev->last_queue_ramp_up = jiffies; } } static void scsi_handle_queue_full(struct scsi_device *sdev) { const struct scsi_host_template *sht = sdev->host->hostt; struct scsi_device *tmp_sdev; if (!sht->track_queue_depth) return; shost_for_each_device(tmp_sdev, sdev->host) { if (tmp_sdev->channel != sdev->channel || tmp_sdev->id != sdev->id) continue; /* * We do not know the number of commands that were at * the device when we got the queue full so we start * from the highest possible value and work our way down. */ scsi_track_queue_full(tmp_sdev, tmp_sdev->queue_depth - 1); } } /** * scsi_eh_completed_normally - Disposition a eh cmd on return from LLD. * @scmd: SCSI cmd to examine. * * Notes: * This is *only* called when we are examining the status of commands * queued during error recovery. the main difference here is that we * don't allow for the possibility of retries here, and we are a lot * more restrictive about what we consider acceptable. */ static enum scsi_disposition scsi_eh_completed_normally(struct scsi_cmnd *scmd) { /* * first check the host byte, to see if there is anything in there * that would indicate what we need to do. */ if (host_byte(scmd->result) == DID_RESET) { /* * rats. we are already in the error handler, so we now * get to try and figure out what to do next. if the sense * is valid, we have a pretty good idea of what to do. * if not, we mark it as FAILED. */ return scsi_check_sense(scmd); } if (host_byte(scmd->result) != DID_OK) return FAILED; /* * now, check the status byte to see if this indicates * anything special. */ switch (get_status_byte(scmd)) { case SAM_STAT_GOOD: scsi_handle_queue_ramp_up(scmd->device); if (scmd->sense_buffer && SCSI_SENSE_VALID(scmd)) /* * If we have sense data, call scsi_check_sense() in * order to set the correct SCSI ML byte (if any). * No point in checking the return value, since the * command has already completed successfully. */ scsi_check_sense(scmd); fallthrough; case SAM_STAT_COMMAND_TERMINATED: return SUCCESS; case SAM_STAT_CHECK_CONDITION: return scsi_check_sense(scmd); case SAM_STAT_CONDITION_MET: case SAM_STAT_INTERMEDIATE: case SAM_STAT_INTERMEDIATE_CONDITION_MET: /* * who knows? FIXME(eric) */ return SUCCESS; case SAM_STAT_RESERVATION_CONFLICT: if (scmd->cmnd[0] == TEST_UNIT_READY) /* it is a success, we probed the device and * found it */ return SUCCESS; /* otherwise, we failed to send the command */ return FAILED; case SAM_STAT_TASK_SET_FULL: scsi_handle_queue_full(scmd->device); fallthrough; case SAM_STAT_BUSY: return NEEDS_RETRY; default: return FAILED; } return FAILED; } /** * scsi_eh_done - Completion function for error handling. * @scmd: Cmd that is done. */ void scsi_eh_done(struct scsi_cmnd *scmd) { struct completion *eh_action; SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s result: %x\n", __func__, scmd->result)); eh_action = scmd->device->host->eh_action; if (eh_action) complete(eh_action); } /** * scsi_try_host_reset - ask host adapter to reset itself * @scmd: SCSI cmd to send host reset. */ static enum scsi_disposition scsi_try_host_reset(struct scsi_cmnd *scmd) { unsigned long flags; enum scsi_disposition rtn; struct Scsi_Host *host = scmd->device->host; const struct scsi_host_template *hostt = host->hostt; SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, host, "Snd Host RST\n")); if (!hostt->eh_host_reset_handler) return FAILED; rtn = hostt->eh_host_reset_handler(scmd); if (rtn == SUCCESS) { if (!hostt->skip_settle_delay) ssleep(HOST_RESET_SETTLE_TIME); spin_lock_irqsave(host->host_lock, flags); scsi_report_bus_reset(host, scmd_channel(scmd)); spin_unlock_irqrestore(host->host_lock, flags); } return rtn; } /** * scsi_try_bus_reset - ask host to perform a bus reset * @scmd: SCSI cmd to send bus reset. */ static enum scsi_disposition scsi_try_bus_reset(struct scsi_cmnd *scmd) { unsigned long flags; enum scsi_disposition rtn; struct Scsi_Host *host = scmd->device->host; const struct scsi_host_template *hostt = host->hostt; SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: Snd Bus RST\n", __func__)); if (!hostt->eh_bus_reset_handler) return FAILED; rtn = hostt->eh_bus_reset_handler(scmd); if (rtn == SUCCESS) { if (!hostt->skip_settle_delay) ssleep(BUS_RESET_SETTLE_TIME); spin_lock_irqsave(host->host_lock, flags); scsi_report_bus_reset(host, scmd_channel(scmd)); spin_unlock_irqrestore(host->host_lock, flags); } return rtn; } static void __scsi_report_device_reset(struct scsi_device *sdev, void *data) { sdev->was_reset = 1; sdev->expecting_cc_ua = 1; } /** * scsi_try_target_reset - Ask host to perform a target reset * @scmd: SCSI cmd used to send a target reset * * Notes: * There is no timeout for this operation. if this operation is * unreliable for a given host, then the host itself needs to put a * timer on it, and set the host back to a consistent state prior to * returning. */ static enum scsi_disposition scsi_try_target_reset(struct scsi_cmnd *scmd) { unsigned long flags; enum scsi_disposition rtn; struct Scsi_Host *host = scmd->device->host; const struct scsi_host_template *hostt = host->hostt; if (!hostt->eh_target_reset_handler) return FAILED; rtn = hostt->eh_target_reset_handler(scmd); if (rtn == SUCCESS) { spin_lock_irqsave(host->host_lock, flags); __starget_for_each_device(scsi_target(scmd->device), NULL, __scsi_report_device_reset); spin_unlock_irqrestore(host->host_lock, flags); } return rtn; } /** * scsi_try_bus_device_reset - Ask host to perform a BDR on a dev * @scmd: SCSI cmd used to send BDR * * Notes: * There is no timeout for this operation. if this operation is * unreliable for a given host, then the host itself needs to put a * timer on it, and set the host back to a consistent state prior to * returning. */ static enum scsi_disposition scsi_try_bus_device_reset(struct scsi_cmnd *scmd) { enum scsi_disposition rtn; const struct scsi_host_template *hostt = scmd->device->host->hostt; if (!hostt->eh_device_reset_handler) return FAILED; rtn = hostt->eh_device_reset_handler(scmd); if (rtn == SUCCESS) __scsi_report_device_reset(scmd->device, NULL); return rtn; } /** * scsi_try_to_abort_cmd - Ask host to abort a SCSI command * @hostt: SCSI driver host template * @scmd: SCSI cmd used to send a target reset * * Return value: * SUCCESS, FAILED, or FAST_IO_FAIL * * Notes: * SUCCESS does not necessarily indicate that the command * has been aborted; it only indicates that the LLDDs * has cleared all references to that command. * LLDDs should return FAILED only if an abort was required * but could not be executed. LLDDs should return FAST_IO_FAIL * if the device is temporarily unavailable (eg due to a * link down on FibreChannel) */ static enum scsi_disposition scsi_try_to_abort_cmd(const struct scsi_host_template *hostt, struct scsi_cmnd *scmd) { if (!hostt->eh_abort_handler) return FAILED; return hostt->eh_abort_handler(scmd); } static void scsi_abort_eh_cmnd(struct scsi_cmnd *scmd) { if (scsi_try_to_abort_cmd(scmd->device->host->hostt, scmd) != SUCCESS) if (scsi_try_bus_device_reset(scmd) != SUCCESS) if (scsi_try_target_reset(scmd) != SUCCESS) if (scsi_try_bus_reset(scmd) != SUCCESS) scsi_try_host_reset(scmd); } /** * scsi_eh_prep_cmnd - Save a scsi command info as part of error recovery * @scmd: SCSI command structure to hijack * @ses: structure to save restore information * @cmnd: CDB to send. Can be NULL if no new cmnd is needed * @cmnd_size: size in bytes of @cmnd (must be <= MAX_COMMAND_SIZE) * @sense_bytes: size of sense data to copy. or 0 (if != 0 @cmnd is ignored) * * This function is used to save a scsi command information before re-execution * as part of the error recovery process. If @sense_bytes is 0 the command * sent must be one that does not transfer any data. If @sense_bytes != 0 * @cmnd is ignored and this functions sets up a REQUEST_SENSE command * and cmnd buffers to read @sense_bytes into @scmd->sense_buffer. */ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses, unsigned char *cmnd, int cmnd_size, unsigned sense_bytes) { struct scsi_device *sdev = scmd->device; /* * We need saved copies of a number of fields - this is because * error handling may need to overwrite these with different values * to run different commands, and once error handling is complete, * we will need to restore these values prior to running the actual * command. */ ses->cmd_len = scmd->cmd_len; ses->data_direction = scmd->sc_data_direction; ses->sdb = scmd->sdb; ses->result = scmd->result; ses->resid_len = scmd->resid_len; ses->underflow = scmd->underflow; ses->prot_op = scmd->prot_op; ses->eh_eflags = scmd->eh_eflags; scmd->prot_op = SCSI_PROT_NORMAL; scmd->eh_eflags = 0; memcpy(ses->cmnd, scmd->cmnd, sizeof(ses->cmnd)); memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); memset(&scmd->sdb, 0, sizeof(scmd->sdb)); scmd->result = 0; scmd->resid_len = 0; if (sense_bytes) { scmd->sdb.length = min_t(unsigned, SCSI_SENSE_BUFFERSIZE, sense_bytes); sg_init_one(&ses->sense_sgl, scmd->sense_buffer, scmd->sdb.length); scmd->sdb.table.sgl = &ses->sense_sgl; scmd->sc_data_direction = DMA_FROM_DEVICE; scmd->sdb.table.nents = scmd->sdb.table.orig_nents = 1; scmd->cmnd[0] = REQUEST_SENSE; scmd->cmnd[4] = scmd->sdb.length; scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); } else { scmd->sc_data_direction = DMA_NONE; if (cmnd) { BUG_ON(cmnd_size > sizeof(scmd->cmnd)); memcpy(scmd->cmnd, cmnd, cmnd_size); scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); } } scmd->underflow = 0; if (sdev->scsi_level <= SCSI_2 && sdev->scsi_level != SCSI_UNKNOWN) scmd->cmnd[1] = (scmd->cmnd[1] & 0x1f) | (sdev->lun << 5 & 0xe0); /* * Zero the sense buffer. The scsi spec mandates that any * untransferred sense data should be interpreted as being zero. */ memset(scmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); } EXPORT_SYMBOL(scsi_eh_prep_cmnd); /** * scsi_eh_restore_cmnd - Restore a scsi command info as part of error recovery * @scmd: SCSI command structure to restore * @ses: saved information from a coresponding call to scsi_eh_prep_cmnd * * Undo any damage done by above scsi_eh_prep_cmnd(). */ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses) { /* * Restore original data */ scmd->cmd_len = ses->cmd_len; memcpy(scmd->cmnd, ses->cmnd, sizeof(ses->cmnd)); scmd->sc_data_direction = ses->data_direction; scmd->sdb = ses->sdb; scmd->result = ses->result; scmd->resid_len = ses->resid_len; scmd->underflow = ses->underflow; scmd->prot_op = ses->prot_op; scmd->eh_eflags = ses->eh_eflags; } EXPORT_SYMBOL(scsi_eh_restore_cmnd); /** * scsi_send_eh_cmnd - submit a scsi command as part of error recovery * @scmd: SCSI command structure to hijack * @cmnd: CDB to send * @cmnd_size: size in bytes of @cmnd * @timeout: timeout for this request * @sense_bytes: size of sense data to copy or 0 * * This function is used to send a scsi command down to a target device * as part of the error recovery process. See also scsi_eh_prep_cmnd() above. * * Return value: * SUCCESS or FAILED or NEEDS_RETRY */ static enum scsi_disposition scsi_send_eh_cmnd(struct scsi_cmnd *scmd, unsigned char *cmnd, int cmnd_size, int timeout, unsigned sense_bytes) { struct scsi_device *sdev = scmd->device; struct Scsi_Host *shost = sdev->host; DECLARE_COMPLETION_ONSTACK(done); unsigned long timeleft = timeout, delay; struct scsi_eh_save ses; const unsigned long stall_for = msecs_to_jiffies(100); int rtn; retry: scsi_eh_prep_cmnd(scmd, &ses, cmnd, cmnd_size, sense_bytes); shost->eh_action = &done; scsi_log_send(scmd); scmd->submitter = SUBMITTED_BY_SCSI_ERROR_HANDLER; scmd->flags |= SCMD_LAST; /* * Lock sdev->state_mutex to avoid that scsi_device_quiesce() can * change the SCSI device state after we have examined it and before * .queuecommand() is called. */ mutex_lock(&sdev->state_mutex); while (sdev->sdev_state == SDEV_BLOCK && timeleft > 0) { mutex_unlock(&sdev->state_mutex); SCSI_LOG_ERROR_RECOVERY(5, sdev_printk(KERN_DEBUG, sdev, "%s: state %d <> %d\n", __func__, sdev->sdev_state, SDEV_BLOCK)); delay = min(timeleft, stall_for); timeleft -= delay; msleep(jiffies_to_msecs(delay)); mutex_lock(&sdev->state_mutex); } if (sdev->sdev_state != SDEV_BLOCK) rtn = shost->hostt->queuecommand(shost, scmd); else rtn = FAILED; mutex_unlock(&sdev->state_mutex); if (rtn) { if (timeleft > stall_for) { scsi_eh_restore_cmnd(scmd, &ses); timeleft -= stall_for; msleep(jiffies_to_msecs(stall_for)); goto retry; } /* signal not to enter either branch of the if () below */ timeleft = 0; rtn = FAILED; } else { timeleft = wait_for_completion_timeout(&done, timeout); rtn = SUCCESS; } shost->eh_action = NULL; scsi_log_completion(scmd, rtn); SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s timeleft: %ld\n", __func__, timeleft)); /* * If there is time left scsi_eh_done got called, and we will examine * the actual status codes to see whether the command actually did * complete normally, else if we have a zero return and no time left, * the command must still be pending, so abort it and return FAILED. * If we never actually managed to issue the command, because * ->queuecommand() kept returning non zero, use the rtn = FAILED * value above (so don't execute either branch of the if) */ if (timeleft) { rtn = scsi_eh_completed_normally(scmd); SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: scsi_eh_completed_normally %x\n", __func__, rtn)); switch (rtn) { case SUCCESS: case NEEDS_RETRY: case FAILED: break; case ADD_TO_MLQUEUE: rtn = NEEDS_RETRY; break; default: rtn = FAILED; break; } } else if (rtn != FAILED) { scsi_abort_eh_cmnd(scmd); rtn = FAILED; } scsi_eh_restore_cmnd(scmd, &ses); return rtn; } /** * scsi_request_sense - Request sense data from a particular target. * @scmd: SCSI cmd for request sense. * * Notes: * Some hosts automatically obtain this information, others require * that we obtain it on our own. This function will *not* return until * the command either times out, or it completes. */ static enum scsi_disposition scsi_request_sense(struct scsi_cmnd *scmd) { return scsi_send_eh_cmnd(scmd, NULL, 0, scmd->device->eh_timeout, ~0); } static enum scsi_disposition scsi_eh_action(struct scsi_cmnd *scmd, enum scsi_disposition rtn) { if (!blk_rq_is_passthrough(scsi_cmd_to_rq(scmd))) { struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); if (sdrv->eh_action) rtn = sdrv->eh_action(scmd, rtn); } return rtn; } /** * scsi_eh_finish_cmd - Handle a cmd that eh is finished with. * @scmd: Original SCSI cmd that eh has finished. * @done_q: Queue for processed commands. * * Notes: * We don't want to use the normal command completion while we are are * still handling errors - it may cause other commands to be queued, * and that would disturb what we are doing. Thus we really want to * keep a list of pending commands for final completion, and once we * are ready to leave error handling we handle completion for real. */ void scsi_eh_finish_cmd(struct scsi_cmnd *scmd, struct list_head *done_q) { list_move_tail(&scmd->eh_entry, done_q); } EXPORT_SYMBOL(scsi_eh_finish_cmd); /** * scsi_eh_get_sense - Get device sense data. * @work_q: Queue of commands to process. * @done_q: Queue of processed commands. * * Description: * See if we need to request sense information. if so, then get it * now, so we have a better idea of what to do. * * Notes: * This has the unfortunate side effect that if a shost adapter does * not automatically request sense information, we end up shutting * it down before we request it. * * All drivers should request sense information internally these days, * so for now all I have to say is tough noogies if you end up in here. * * XXX: Long term this code should go away, but that needs an audit of * all LLDDs first. */ int scsi_eh_get_sense(struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *next; struct Scsi_Host *shost; enum scsi_disposition rtn; /* * If SCSI_EH_ABORT_SCHEDULED has been set, it is timeout IO, * should not get sense. */ list_for_each_entry_safe(scmd, next, work_q, eh_entry) { if ((scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) || SCSI_SENSE_VALID(scmd)) continue; shost = scmd->device->host; if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: skip request sense, past eh deadline\n", current->comm)); break; } if (!scsi_status_is_check_condition(scmd->result)) /* * don't request sense if there's no check condition * status because the error we're processing isn't one * that has a sense code (and some devices get * confused by sense requests out of the blue) */ continue; SCSI_LOG_ERROR_RECOVERY(2, scmd_printk(KERN_INFO, scmd, "%s: requesting sense\n", current->comm)); rtn = scsi_request_sense(scmd); if (rtn != SUCCESS) continue; SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "sense requested, result %x\n", scmd->result)); SCSI_LOG_ERROR_RECOVERY(3, scsi_print_sense(scmd)); rtn = scsi_decide_disposition(scmd); /* * if the result was normal, then just pass it along to the * upper level. */ if (rtn == SUCCESS) /* * We don't want this command reissued, just finished * with the sense data, so set retries to the max * allowed to ensure it won't get reissued. If the user * has requested infinite retries, we also want to * finish this command, so force completion by setting * retries and allowed to the same value. */ if (scmd->allowed == SCSI_CMD_RETRIES_NO_LIMIT) scmd->retries = scmd->allowed = 1; else scmd->retries = scmd->allowed; else if (rtn != NEEDS_RETRY) continue; scsi_eh_finish_cmd(scmd, done_q); } return list_empty(work_q); } EXPORT_SYMBOL_GPL(scsi_eh_get_sense); /** * scsi_eh_tur - Send TUR to device. * @scmd: &scsi_cmnd to send TUR * * Return value: * 0 - Device is ready. 1 - Device NOT ready. */ static int scsi_eh_tur(struct scsi_cmnd *scmd) { static unsigned char tur_command[6] = {TEST_UNIT_READY, 0, 0, 0, 0, 0}; int retry_cnt = 1; enum scsi_disposition rtn; retry_tur: rtn = scsi_send_eh_cmnd(scmd, tur_command, 6, scmd->device->eh_timeout, 0); SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s return: %x\n", __func__, rtn)); switch (rtn) { case NEEDS_RETRY: if (retry_cnt--) goto retry_tur; fallthrough; case SUCCESS: return 0; default: return 1; } } /** * scsi_eh_test_devices - check if devices are responding from error recovery. * @cmd_list: scsi commands in error recovery. * @work_q: queue for commands which still need more error recovery * @done_q: queue for commands which are finished * @try_stu: boolean on if a STU command should be tried in addition to TUR. * * Decription: * Tests if devices are in a working state. Commands to devices now in * a working state are sent to the done_q while commands to devices which * are still failing to respond are returned to the work_q for more * processing. **/ static int scsi_eh_test_devices(struct list_head *cmd_list, struct list_head *work_q, struct list_head *done_q, int try_stu) { struct scsi_cmnd *scmd, *next; struct scsi_device *sdev; int finish_cmds; while (!list_empty(cmd_list)) { scmd = list_entry(cmd_list->next, struct scsi_cmnd, eh_entry); sdev = scmd->device; if (!try_stu) { if (scsi_host_eh_past_deadline(sdev->host)) { /* Push items back onto work_q */ list_splice_init(cmd_list, work_q); SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: skip test device, past eh deadline", current->comm)); break; } } finish_cmds = !scsi_device_online(scmd->device) || (try_stu && !scsi_eh_try_stu(scmd) && !scsi_eh_tur(scmd)) || !scsi_eh_tur(scmd); list_for_each_entry_safe(scmd, next, cmd_list, eh_entry) if (scmd->device == sdev) { if (finish_cmds && (try_stu || scsi_eh_action(scmd, SUCCESS) == SUCCESS)) scsi_eh_finish_cmd(scmd, done_q); else list_move_tail(&scmd->eh_entry, work_q); } } return list_empty(work_q); } /** * scsi_eh_try_stu - Send START_UNIT to device. * @scmd: &scsi_cmnd to send START_UNIT * * Return value: * 0 - Device is ready. 1 - Device NOT ready. */ static int scsi_eh_try_stu(struct scsi_cmnd *scmd) { static unsigned char stu_command[6] = {START_STOP, 0, 0, 0, 1, 0}; if (scmd->device->allow_restart) { int i; enum scsi_disposition rtn = NEEDS_RETRY; for (i = 0; rtn == NEEDS_RETRY && i < 2; i++) rtn = scsi_send_eh_cmnd(scmd, stu_command, 6, scmd->device->eh_timeout, 0); if (rtn == SUCCESS) return 0; } return 1; } /** * scsi_eh_stu - send START_UNIT if needed * @shost: &scsi host being recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. * * Notes: * If commands are failing due to not ready, initializing command required, * try revalidating the device, which will end up sending a start unit. */ static int scsi_eh_stu(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *stu_scmd, *next; struct scsi_device *sdev; shost_for_each_device(sdev, shost) { if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: skip START_UNIT, past eh deadline\n", current->comm)); scsi_device_put(sdev); break; } stu_scmd = NULL; list_for_each_entry(scmd, work_q, eh_entry) if (scmd->device == sdev && SCSI_SENSE_VALID(scmd) && scsi_check_sense(scmd) == FAILED ) { stu_scmd = scmd; break; } if (!stu_scmd) continue; SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: Sending START_UNIT\n", current->comm)); if (!scsi_eh_try_stu(stu_scmd)) { if (!scsi_device_online(sdev) || !scsi_eh_tur(stu_scmd)) { list_for_each_entry_safe(scmd, next, work_q, eh_entry) { if (scmd->device == sdev && scsi_eh_action(scmd, SUCCESS) == SUCCESS) scsi_eh_finish_cmd(scmd, done_q); } } } else { SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: START_UNIT failed\n", current->comm)); } } return list_empty(work_q); } /** * scsi_eh_bus_device_reset - send bdr if needed * @shost: scsi host being recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. * * Notes: * Try a bus device reset. Still, look to see whether we have multiple * devices that are jammed or not - if we have multiple devices, it * makes no sense to try bus_device_reset - we really would need to try * a bus_reset instead. */ static int scsi_eh_bus_device_reset(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *bdr_scmd, *next; struct scsi_device *sdev; enum scsi_disposition rtn; shost_for_each_device(sdev, shost) { if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: skip BDR, past eh deadline\n", current->comm)); scsi_device_put(sdev); break; } bdr_scmd = NULL; list_for_each_entry(scmd, work_q, eh_entry) if (scmd->device == sdev) { bdr_scmd = scmd; break; } if (!bdr_scmd) continue; SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: Sending BDR\n", current->comm)); rtn = scsi_try_bus_device_reset(bdr_scmd); if (rtn == SUCCESS || rtn == FAST_IO_FAIL) { if (!scsi_device_online(sdev) || rtn == FAST_IO_FAIL || !scsi_eh_tur(bdr_scmd)) { list_for_each_entry_safe(scmd, next, work_q, eh_entry) { if (scmd->device == sdev && scsi_eh_action(scmd, rtn) != FAILED) scsi_eh_finish_cmd(scmd, done_q); } } } else { SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: BDR failed\n", current->comm)); } } return list_empty(work_q); } /** * scsi_eh_target_reset - send target reset if needed * @shost: scsi host being recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. * * Notes: * Try a target reset. */ static int scsi_eh_target_reset(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { LIST_HEAD(tmp_list); LIST_HEAD(check_list); list_splice_init(work_q, &tmp_list); while (!list_empty(&tmp_list)) { struct scsi_cmnd *next, *scmd; enum scsi_disposition rtn; unsigned int id; if (scsi_host_eh_past_deadline(shost)) { /* push back on work queue for further processing */ list_splice_init(&check_list, work_q); list_splice_init(&tmp_list, work_q); SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Skip target reset, past eh deadline\n", current->comm)); return list_empty(work_q); } scmd = list_entry(tmp_list.next, struct scsi_cmnd, eh_entry); id = scmd_id(scmd); SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Sending target reset to target %d\n", current->comm, id)); rtn = scsi_try_target_reset(scmd); if (rtn != SUCCESS && rtn != FAST_IO_FAIL) SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Target reset failed" " target: %d\n", current->comm, id)); list_for_each_entry_safe(scmd, next, &tmp_list, eh_entry) { if (scmd_id(scmd) != id) continue; if (rtn == SUCCESS) list_move_tail(&scmd->eh_entry, &check_list); else if (rtn == FAST_IO_FAIL) scsi_eh_finish_cmd(scmd, done_q); else /* push back on work queue for further processing */ list_move(&scmd->eh_entry, work_q); } } return scsi_eh_test_devices(&check_list, work_q, done_q, 0); } /** * scsi_eh_bus_reset - send a bus reset * @shost: &scsi host being recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. */ static int scsi_eh_bus_reset(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *chan_scmd, *next; LIST_HEAD(check_list); unsigned int channel; enum scsi_disposition rtn; /* * we really want to loop over the various channels, and do this on * a channel by channel basis. we should also check to see if any * of the failed commands are on soft_reset devices, and if so, skip * the reset. */ for (channel = 0; channel <= shost->max_channel; channel++) { if (scsi_host_eh_past_deadline(shost)) { list_splice_init(&check_list, work_q); SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: skip BRST, past eh deadline\n", current->comm)); return list_empty(work_q); } chan_scmd = NULL; list_for_each_entry(scmd, work_q, eh_entry) { if (channel == scmd_channel(scmd)) { chan_scmd = scmd; break; /* * FIXME add back in some support for * soft_reset devices. */ } } if (!chan_scmd) continue; SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Sending BRST chan: %d\n", current->comm, channel)); rtn = scsi_try_bus_reset(chan_scmd); if (rtn == SUCCESS || rtn == FAST_IO_FAIL) { list_for_each_entry_safe(scmd, next, work_q, eh_entry) { if (channel == scmd_channel(scmd)) { if (rtn == FAST_IO_FAIL) scsi_eh_finish_cmd(scmd, done_q); else list_move_tail(&scmd->eh_entry, &check_list); } } } else { SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: BRST failed chan: %d\n", current->comm, channel)); } } return scsi_eh_test_devices(&check_list, work_q, done_q, 0); } /** * scsi_eh_host_reset - send a host reset * @shost: host to be reset. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. */ static int scsi_eh_host_reset(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *next; LIST_HEAD(check_list); enum scsi_disposition rtn; if (!list_empty(work_q)) { scmd = list_entry(work_q->next, struct scsi_cmnd, eh_entry); SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Sending HRST\n", current->comm)); rtn = scsi_try_host_reset(scmd); if (rtn == SUCCESS) { list_splice_init(work_q, &check_list); } else if (rtn == FAST_IO_FAIL) { list_for_each_entry_safe(scmd, next, work_q, eh_entry) { scsi_eh_finish_cmd(scmd, done_q); } } else { SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: HRST failed\n", current->comm)); } } return scsi_eh_test_devices(&check_list, work_q, done_q, 1); } /** * scsi_eh_offline_sdevs - offline scsi devices that fail to recover * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. */ static void scsi_eh_offline_sdevs(struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *next; struct scsi_device *sdev; list_for_each_entry_safe(scmd, next, work_q, eh_entry) { sdev_printk(KERN_INFO, scmd->device, "Device offlined - " "not ready after error recovery\n"); sdev = scmd->device; mutex_lock(&sdev->state_mutex); scsi_device_set_state(sdev, SDEV_OFFLINE); mutex_unlock(&sdev->state_mutex); scsi_eh_finish_cmd(scmd, done_q); } return; } /** * scsi_noretry_cmd - determine if command should be failed fast * @scmd: SCSI cmd to examine. */ bool scsi_noretry_cmd(struct scsi_cmnd *scmd) { struct request *req = scsi_cmd_to_rq(scmd); switch (host_byte(scmd->result)) { case DID_OK: break; case DID_TIME_OUT: goto check_type; case DID_BUS_BUSY: return !!(req->cmd_flags & REQ_FAILFAST_TRANSPORT); case DID_PARITY: return !!(req->cmd_flags & REQ_FAILFAST_DEV); case DID_ERROR: if (get_status_byte(scmd) == SAM_STAT_RESERVATION_CONFLICT) return false; fallthrough; case DID_SOFT_ERROR: return !!(req->cmd_flags & REQ_FAILFAST_DRIVER); } /* Never retry commands aborted due to a duration limit timeout */ if (scsi_ml_byte(scmd->result) == SCSIML_STAT_DL_TIMEOUT) return true; if (!scsi_status_is_check_condition(scmd->result)) return false; check_type: /* * assume caller has checked sense and determined * the check condition was retryable. */ if (req->cmd_flags & REQ_FAILFAST_DEV || blk_rq_is_passthrough(req)) return true; return false; } /** * scsi_decide_disposition - Disposition a cmd on return from LLD. * @scmd: SCSI cmd to examine. * * Notes: * This is *only* called when we are examining the status after sending * out the actual data command. any commands that are queued for error * recovery (e.g. test_unit_ready) do *not* come through here. * * When this routine returns failed, it means the error handler thread * is woken. In cases where the error code indicates an error that * doesn't require the error handler read (i.e. we don't need to * abort/reset), this function should return SUCCESS. */ enum scsi_disposition scsi_decide_disposition(struct scsi_cmnd *scmd) { enum scsi_disposition rtn; /* * if the device is offline, then we clearly just pass the result back * up to the top level. */ if (!scsi_device_online(scmd->device)) { SCSI_LOG_ERROR_RECOVERY(5, scmd_printk(KERN_INFO, scmd, "%s: device offline - report as SUCCESS\n", __func__)); return SUCCESS; } /* * first check the host byte, to see if there is anything in there * that would indicate what we need to do. */ switch (host_byte(scmd->result)) { case DID_PASSTHROUGH: /* * no matter what, pass this through to the upper layer. * nuke this special code so that it looks like we are saying * did_ok. */ scmd->result &= 0xff00ffff; return SUCCESS; case DID_OK: /* * looks good. drop through, and check the next byte. */ break; case DID_ABORT: if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) { set_host_byte(scmd, DID_TIME_OUT); return SUCCESS; } fallthrough; case DID_NO_CONNECT: case DID_BAD_TARGET: /* * note - this means that we just report the status back * to the top level driver, not that we actually think * that it indicates SUCCESS. */ return SUCCESS; case DID_SOFT_ERROR: /* * when the low level driver returns did_soft_error, * it is responsible for keeping an internal retry counter * in order to avoid endless loops (db) */ goto maybe_retry; case DID_IMM_RETRY: return NEEDS_RETRY; case DID_REQUEUE: return ADD_TO_MLQUEUE; case DID_TRANSPORT_DISRUPTED: /* * LLD/transport was disrupted during processing of the IO. * The transport class is now blocked/blocking, * and the transport will decide what to do with the IO * based on its timers and recovery capablilities if * there are enough retries. */ goto maybe_retry; case DID_TRANSPORT_FAILFAST: /* * The transport decided to failfast the IO (most likely * the fast io fail tmo fired), so send IO directly upwards. */ return SUCCESS; case DID_TRANSPORT_MARGINAL: /* * caller has decided not to do retries on * abort success, so send IO directly upwards */ return SUCCESS; case DID_ERROR: if (get_status_byte(scmd) == SAM_STAT_RESERVATION_CONFLICT) /* * execute reservation conflict processing code * lower down */ break; fallthrough; case DID_BUS_BUSY: case DID_PARITY: goto maybe_retry; case DID_TIME_OUT: /* * when we scan the bus, we get timeout messages for * these commands if there is no device available. * other hosts report did_no_connect for the same thing. */ if ((scmd->cmnd[0] == TEST_UNIT_READY || scmd->cmnd[0] == INQUIRY)) { return SUCCESS; } else { return FAILED; } case DID_RESET: return SUCCESS; default: return FAILED; } /* * check the status byte to see if this indicates anything special. */ switch (get_status_byte(scmd)) { case SAM_STAT_TASK_SET_FULL: scsi_handle_queue_full(scmd->device); /* * the case of trying to send too many commands to a * tagged queueing device. */ fallthrough; case SAM_STAT_BUSY: /* * device can't talk to us at the moment. Should only * occur (SAM-3) when the task queue is empty, so will cause * the empty queue handling to trigger a stall in the * device. */ return ADD_TO_MLQUEUE; case SAM_STAT_GOOD: if (scmd->cmnd[0] == REPORT_LUNS) scmd->device->sdev_target->expecting_lun_change = 0; scsi_handle_queue_ramp_up(scmd->device); if (scmd->sense_buffer && SCSI_SENSE_VALID(scmd)) /* * If we have sense data, call scsi_check_sense() in * order to set the correct SCSI ML byte (if any). * No point in checking the return value, since the * command has already completed successfully. */ scsi_check_sense(scmd); fallthrough; case SAM_STAT_COMMAND_TERMINATED: return SUCCESS; case SAM_STAT_TASK_ABORTED: goto maybe_retry; case SAM_STAT_CHECK_CONDITION: rtn = scsi_check_sense(scmd); if (rtn == NEEDS_RETRY) goto maybe_retry; /* if rtn == FAILED, we have no sense information; * returning FAILED will wake the error handler thread * to collect the sense and redo the decide * disposition */ return rtn; case SAM_STAT_CONDITION_MET: case SAM_STAT_INTERMEDIATE: case SAM_STAT_INTERMEDIATE_CONDITION_MET: case SAM_STAT_ACA_ACTIVE: /* * who knows? FIXME(eric) */ return SUCCESS; case SAM_STAT_RESERVATION_CONFLICT: sdev_printk(KERN_INFO, scmd->device, "reservation conflict\n"); set_scsi_ml_byte(scmd, SCSIML_STAT_RESV_CONFLICT); return SUCCESS; /* causes immediate i/o error */ } return FAILED; maybe_retry: /* we requeue for retry because the error was retryable, and * the request was not marked fast fail. Note that above, * even if the request is marked fast fail, we still requeue * for queue congestion conditions (QUEUE_FULL or BUSY) */ if (scsi_cmd_retry_allowed(scmd) && !scsi_noretry_cmd(scmd)) { return NEEDS_RETRY; } else { /* * no more retries - report this one back to upper level. */ return SUCCESS; } } static enum rq_end_io_ret eh_lock_door_done(struct request *req, blk_status_t status) { blk_mq_free_request(req); return RQ_END_IO_NONE; } /** * scsi_eh_lock_door - Prevent medium removal for the specified device * @sdev: SCSI device to prevent medium removal * * Locking: * We must be called from process context. * * Notes: * We queue up an asynchronous "ALLOW MEDIUM REMOVAL" request on the * head of the devices request queue, and continue. */ static void scsi_eh_lock_door(struct scsi_device *sdev) { struct scsi_cmnd *scmd; struct request *req; req = scsi_alloc_request(sdev->request_queue, REQ_OP_DRV_IN, 0); if (IS_ERR(req)) return; scmd = blk_mq_rq_to_pdu(req); scmd->cmnd[0] = ALLOW_MEDIUM_REMOVAL; scmd->cmnd[1] = 0; scmd->cmnd[2] = 0; scmd->cmnd[3] = 0; scmd->cmnd[4] = SCSI_REMOVAL_PREVENT; scmd->cmnd[5] = 0; scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); scmd->allowed = 5; req->rq_flags |= RQF_QUIET; req->timeout = 10 * HZ; req->end_io = eh_lock_door_done; blk_execute_rq_nowait(req, true); } /** * scsi_restart_operations - restart io operations to the specified host. * @shost: Host we are restarting. * * Notes: * When we entered the error handler, we blocked all further i/o to * this device. we need to 'reverse' this process. */ static void scsi_restart_operations(struct Scsi_Host *shost) { struct scsi_device *sdev; unsigned long flags; /* * If the door was locked, we need to insert a door lock request * onto the head of the SCSI request queue for the device. There * is no point trying to lock the door of an off-line device. */ shost_for_each_device(sdev, shost) { if (scsi_device_online(sdev) && sdev->was_reset && sdev->locked) { scsi_eh_lock_door(sdev); sdev->was_reset = 0; } } /* * next free up anything directly waiting upon the host. this * will be requests for character device operations, and also for * ioctls to queued block devices. */ SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "waking up host to restart\n")); spin_lock_irqsave(shost->host_lock, flags); if (scsi_host_set_state(shost, SHOST_RUNNING)) if (scsi_host_set_state(shost, SHOST_CANCEL)) BUG_ON(scsi_host_set_state(shost, SHOST_DEL)); spin_unlock_irqrestore(shost->host_lock, flags); wake_up(&shost->host_wait); /* * finally we need to re-initiate requests that may be pending. we will * have had everything blocked while error handling is taking place, and * now that error recovery is done, we will need to ensure that these * requests are started. */ scsi_run_host_queues(shost); /* * if eh is active and host_eh_scheduled is pending we need to re-run * recovery. we do this check after scsi_run_host_queues() to allow * everything pent up since the last eh run a chance to make forward * progress before we sync again. Either we'll immediately re-run * recovery or scsi_device_unbusy() will wake us again when these * pending commands complete. */ spin_lock_irqsave(shost->host_lock, flags); if (shost->host_eh_scheduled) if (scsi_host_set_state(shost, SHOST_RECOVERY)) WARN_ON(scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY)); spin_unlock_irqrestore(shost->host_lock, flags); } /** * scsi_eh_ready_devs - check device ready state and recover if not. * @shost: host to be recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. */ void scsi_eh_ready_devs(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { if (!scsi_eh_stu(shost, work_q, done_q)) if (!scsi_eh_bus_device_reset(shost, work_q, done_q)) if (!scsi_eh_target_reset(shost, work_q, done_q)) if (!scsi_eh_bus_reset(shost, work_q, done_q)) if (!scsi_eh_host_reset(shost, work_q, done_q)) scsi_eh_offline_sdevs(work_q, done_q); } EXPORT_SYMBOL_GPL(scsi_eh_ready_devs); /** * scsi_eh_flush_done_q - finish processed commands or retry them. * @done_q: list_head of processed commands. */ void scsi_eh_flush_done_q(struct list_head *done_q) { struct scsi_cmnd *scmd, *next; list_for_each_entry_safe(scmd, next, done_q, eh_entry) { struct scsi_device *sdev = scmd->device; list_del_init(&scmd->eh_entry); if (scsi_device_online(sdev) && !scsi_noretry_cmd(scmd) && scsi_cmd_retry_allowed(scmd) && scsi_eh_should_retry_cmd(scmd)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: flush retry cmd\n", current->comm)); scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY); blk_mq_kick_requeue_list(sdev->request_queue); } else { /* * If just we got sense for the device (called * scsi_eh_get_sense), scmd->result is already * set, do not set DID_TIME_OUT. */ if (!scmd->result && !(scmd->flags & SCMD_FORCE_EH_SUCCESS)) scmd->result |= (DID_TIME_OUT << 16); SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: flush finish cmd\n", current->comm)); scsi_finish_command(scmd); } } } EXPORT_SYMBOL(scsi_eh_flush_done_q); /** * scsi_unjam_host - Attempt to fix a host which has a cmd that failed. * @shost: Host to unjam. * * Notes: * When we come in here, we *know* that all commands on the bus have * either completed, failed or timed out. we also know that no further * commands are being sent to the host, so things are relatively quiet * and we have freedom to fiddle with things as we wish. * * This is only the *default* implementation. it is possible for * individual drivers to supply their own version of this function, and * if the maintainer wishes to do this, it is strongly suggested that * this function be taken as a template and modified. this function * was designed to correctly handle problems for about 95% of the * different cases out there, and it should always provide at least a * reasonable amount of error recovery. * * Any command marked 'failed' or 'timeout' must eventually have * scsi_finish_cmd() called for it. we do all of the retry stuff * here, so when we restart the host after we return it should have an * empty queue. */ static void scsi_unjam_host(struct Scsi_Host *shost) { unsigned long flags; LIST_HEAD(eh_work_q); LIST_HEAD(eh_done_q); spin_lock_irqsave(shost->host_lock, flags); list_splice_init(&shost->eh_cmd_q, &eh_work_q); spin_unlock_irqrestore(shost->host_lock, flags); SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(shost, &eh_work_q)); if (!scsi_eh_get_sense(&eh_work_q, &eh_done_q)) scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q); spin_lock_irqsave(shost->host_lock, flags); if (shost->eh_deadline != -1) shost->last_reset = 0; spin_unlock_irqrestore(shost->host_lock, flags); scsi_eh_flush_done_q(&eh_done_q); } /** * scsi_error_handler - SCSI error handler thread * @data: Host for which we are running. * * Notes: * This is the main error handling loop. This is run as a kernel thread * for every SCSI host and handles all error handling activity. */ int scsi_error_handler(void *data) { struct Scsi_Host *shost = data; /* * We use TASK_INTERRUPTIBLE so that the thread is not * counted against the load average as a running process. * We never actually get interrupted because kthread_run * disables signal delivery for the created thread. */ while (true) { /* * The sequence in kthread_stop() sets the stop flag first * then wakes the process. To avoid missed wakeups, the task * should always be in a non running state before the stop * flag is checked */ set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) break; if ((shost->host_failed == 0 && shost->host_eh_scheduled == 0) || shost->host_failed != scsi_host_busy(shost)) { SCSI_LOG_ERROR_RECOVERY(1, shost_printk(KERN_INFO, shost, "scsi_eh_%d: sleeping\n", shost->host_no)); schedule(); continue; } __set_current_state(TASK_RUNNING); SCSI_LOG_ERROR_RECOVERY(1, shost_printk(KERN_INFO, shost, "scsi_eh_%d: waking up %d/%d/%d\n", shost->host_no, shost->host_eh_scheduled, shost->host_failed, scsi_host_busy(shost))); /* * We have a host that is failing for some reason. Figure out * what we need to do to get it up and online again (if we can). * If we fail, we end up taking the thing offline. */ if (!shost->eh_noresume && scsi_autopm_get_host(shost) != 0) { SCSI_LOG_ERROR_RECOVERY(1, shost_printk(KERN_ERR, shost, "scsi_eh_%d: unable to autoresume\n", shost->host_no)); continue; } if (shost->transportt->eh_strategy_handler) shost->transportt->eh_strategy_handler(shost); else scsi_unjam_host(shost); /* All scmds have been handled */ shost->host_failed = 0; /* * Note - if the above fails completely, the action is to take * individual devices offline and flush the queue of any * outstanding requests that may have been pending. When we * restart, we restart any I/O to any other devices on the bus * which are still online. */ scsi_restart_operations(shost); if (!shost->eh_noresume) scsi_autopm_put_host(shost); } __set_current_state(TASK_RUNNING); SCSI_LOG_ERROR_RECOVERY(1, shost_printk(KERN_INFO, shost, "Error handler scsi_eh_%d exiting\n", shost->host_no)); shost->ehandler = NULL; return 0; } /* * Function: scsi_report_bus_reset() * * Purpose: Utility function used by low-level drivers to report that * they have observed a bus reset on the bus being handled. * * Arguments: shost - Host in question * channel - channel on which reset was observed. * * Returns: Nothing * * Lock status: Host lock must be held. * * Notes: This only needs to be called if the reset is one which * originates from an unknown location. Resets originated * by the mid-level itself don't need to call this, but there * should be no harm. * * The main purpose of this is to make sure that a CHECK_CONDITION * is properly treated. */ void scsi_report_bus_reset(struct Scsi_Host *shost, int channel) { struct scsi_device *sdev; __shost_for_each_device(sdev, shost) { if (channel == sdev_channel(sdev)) __scsi_report_device_reset(sdev, NULL); } } EXPORT_SYMBOL(scsi_report_bus_reset); /* * Function: scsi_report_device_reset() * * Purpose: Utility function used by low-level drivers to report that * they have observed a device reset on the device being handled. * * Arguments: shost - Host in question * channel - channel on which reset was observed * target - target on which reset was observed * * Returns: Nothing * * Lock status: Host lock must be held * * Notes: This only needs to be called if the reset is one which * originates from an unknown location. Resets originated * by the mid-level itself don't need to call this, but there * should be no harm. * * The main purpose of this is to make sure that a CHECK_CONDITION * is properly treated. */ void scsi_report_device_reset(struct Scsi_Host *shost, int channel, int target) { struct scsi_device *sdev; __shost_for_each_device(sdev, shost) { if (channel == sdev_channel(sdev) && target == sdev_id(sdev)) __scsi_report_device_reset(sdev, NULL); } } EXPORT_SYMBOL(scsi_report_device_reset); /** * scsi_ioctl_reset: explicitly reset a host/bus/target/device * @dev: scsi_device to operate on * @arg: reset type (see sg.h) */ int scsi_ioctl_reset(struct scsi_device *dev, int __user *arg) { struct scsi_cmnd *scmd; struct Scsi_Host *shost = dev->host; struct request *rq; unsigned long flags; int error = 0, val; enum scsi_disposition rtn; if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO)) return -EACCES; error = get_user(val, arg); if (error) return error; if (scsi_autopm_get_host(shost) < 0) return -EIO; error = -EIO; rq = kzalloc(sizeof(struct request) + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size, GFP_KERNEL); if (!rq) goto out_put_autopm_host; blk_rq_init(NULL, rq); scmd = (struct scsi_cmnd *)(rq + 1); scsi_init_command(dev, scmd); scmd->submitter = SUBMITTED_BY_SCSI_RESET_IOCTL; scmd->flags |= SCMD_LAST; memset(&scmd->sdb, 0, sizeof(scmd->sdb)); scmd->cmd_len = 0; scmd->sc_data_direction = DMA_BIDIRECTIONAL; spin_lock_irqsave(shost->host_lock, flags); shost->tmf_in_progress = 1; spin_unlock_irqrestore(shost->host_lock, flags); switch (val & ~SG_SCSI_RESET_NO_ESCALATE) { case SG_SCSI_RESET_NOTHING: rtn = SUCCESS; break; case SG_SCSI_RESET_DEVICE: rtn = scsi_try_bus_device_reset(scmd); if (rtn == SUCCESS || (val & SG_SCSI_RESET_NO_ESCALATE)) break; fallthrough; case SG_SCSI_RESET_TARGET: rtn = scsi_try_target_reset(scmd); if (rtn == SUCCESS || (val & SG_SCSI_RESET_NO_ESCALATE)) break; fallthrough; case SG_SCSI_RESET_BUS: rtn = scsi_try_bus_reset(scmd); if (rtn == SUCCESS || (val & SG_SCSI_RESET_NO_ESCALATE)) break; fallthrough; case SG_SCSI_RESET_HOST: rtn = scsi_try_host_reset(scmd); if (rtn == SUCCESS) break; fallthrough; default: rtn = FAILED; break; } error = (rtn == SUCCESS) ? 0 : -EIO; spin_lock_irqsave(shost->host_lock, flags); shost->tmf_in_progress = 0; spin_unlock_irqrestore(shost->host_lock, flags); /* * be sure to wake up anyone who was sleeping or had their queue * suspended while we performed the TMF. */ SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "waking up host to restart after TMF\n")); wake_up(&shost->host_wait); scsi_run_host_queues(shost); kfree(rq); out_put_autopm_host: scsi_autopm_put_host(shost); return error; } bool scsi_command_normalize_sense(const struct scsi_cmnd *cmd, struct scsi_sense_hdr *sshdr) { return scsi_normalize_sense(cmd->sense_buffer, SCSI_SENSE_BUFFERSIZE, sshdr); } EXPORT_SYMBOL(scsi_command_normalize_sense); /** * scsi_get_sense_info_fld - get information field from sense data (either fixed or descriptor format) * @sense_buffer: byte array of sense data * @sb_len: number of valid bytes in sense_buffer * @info_out: pointer to 64 integer where 8 or 4 byte information * field will be placed if found. * * Return value: * true if information field found, false if not found. */ bool scsi_get_sense_info_fld(const u8 *sense_buffer, int sb_len, u64 *info_out) { const u8 * ucp; if (sb_len < 7) return false; switch (sense_buffer[0] & 0x7f) { case 0x70: case 0x71: if (sense_buffer[0] & 0x80) { *info_out = get_unaligned_be32(&sense_buffer[3]); return true; } return false; case 0x72: case 0x73: ucp = scsi_sense_desc_find(sense_buffer, sb_len, 0 /* info desc */); if (ucp && (0xa == ucp[1])) { *info_out = get_unaligned_be64(&ucp[4]); return true; } return false; default: return false; } } EXPORT_SYMBOL(scsi_get_sense_info_fld); |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 | // SPDX-License-Identifier: GPL-2.0-only /* Support ct functions for openvswitch and used by OVS and TC conntrack. */ #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <net/ipv6_frag.h> #include <net/ip.h> #include <linux/netfilter_ipv6.h> /* 'skb' should already be pulled to nh_ofs. */ int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u16 proto) { const struct nf_conntrack_helper *helper; const struct nf_conn_help *help; unsigned int protoff; int err; if (ctinfo == IP_CT_RELATED_REPLY) return NF_ACCEPT; help = nfct_help(ct); if (!help) return NF_ACCEPT; helper = rcu_dereference(help->helper); if (!helper) return NF_ACCEPT; if (helper->tuple.src.l3num != NFPROTO_UNSPEC && helper->tuple.src.l3num != proto) return NF_ACCEPT; switch (proto) { case NFPROTO_IPV4: protoff = ip_hdrlen(skb); proto = ip_hdr(skb)->protocol; break; case NFPROTO_IPV6: { u8 nexthdr = ipv6_hdr(skb)->nexthdr; __be16 frag_off; int ofs; ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { pr_debug("proto header not found\n"); return NF_ACCEPT; } protoff = ofs; proto = nexthdr; break; } default: WARN_ONCE(1, "helper invoked on non-IP family!"); return NF_DROP; } if (helper->tuple.dst.protonum != proto) return NF_ACCEPT; err = helper->help(skb, protoff, ct, ctinfo); if (err != NF_ACCEPT) return err; /* Adjust seqs after helper. This is needed due to some helpers (e.g., * FTP with NAT) adusting the TCP payload size when mangling IP * addresses and/or port numbers in the text-based control connection. */ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) return NF_DROP; return NF_ACCEPT; } EXPORT_SYMBOL_GPL(nf_ct_helper); int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, u8 proto, bool nat, struct nf_conntrack_helper **hp) { struct nf_conntrack_helper *helper; struct nf_conn_help *help; int ret = 0; helper = nf_conntrack_helper_try_module_get(name, family, proto); if (!helper) return -EINVAL; help = nf_ct_helper_ext_add(ct, GFP_KERNEL); if (!help) { nf_conntrack_helper_put(helper); return -ENOMEM; } #if IS_ENABLED(CONFIG_NF_NAT) if (nat) { ret = nf_nat_helper_try_module_get(name, family, proto); if (ret) { nf_conntrack_helper_put(helper); return ret; } } #endif rcu_assign_pointer(help->helper, helper); *hp = helper; return ret; } EXPORT_SYMBOL_GPL(nf_ct_add_helper); /* Trim the skb to the length specified by the IP/IPv6 header, * removing any trailing lower-layer padding. This prepares the skb * for higher-layer processing that assumes skb->len excludes padding * (such as nf_ip_checksum). The caller needs to pull the skb to the * network header, and ensure ip_hdr/ipv6_hdr points to valid data. */ int nf_ct_skb_network_trim(struct sk_buff *skb, int family) { unsigned int len; switch (family) { case NFPROTO_IPV4: len = skb_ip_totlen(skb); break; case NFPROTO_IPV6: len = ntohs(ipv6_hdr(skb)->payload_len); if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) { int err = nf_ip6_check_hbh_len(skb, &len); if (err) return err; } len += sizeof(struct ipv6hdr); break; default: len = skb->len; } return pskb_trim_rcsum(skb, len); } EXPORT_SYMBOL_GPL(nf_ct_skb_network_trim); /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero * value if 'skb' is freed. */ int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb, u16 zone, u8 family, u8 *proto, u16 *mru) { int err; if (family == NFPROTO_IPV4) { enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); local_bh_disable(); err = ip_defrag(net, skb, user); local_bh_enable(); if (err) return err; *mru = IPCB(skb)->frag_max_size; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) } else if (family == NFPROTO_IPV6) { enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); err = nf_ct_frag6_gather(net, skb, user); if (err) { if (err != -EINPROGRESS) kfree_skb(skb); return err; } *proto = ipv6_hdr(skb)->nexthdr; *mru = IP6CB(skb)->frag_max_size; #endif } else { kfree_skb(skb); return -EPFNOSUPPORT; } skb_clear_hash(skb); skb->ignore_df = 1; return 0; } EXPORT_SYMBOL_GPL(nf_ct_handle_fragments); |
16 17 17 16 1 1 940 2 2 2 3 2 12 1 12 12 1 1 15 15 15 15 12 15 12 12 12 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 | // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * Frame handler other utility functions for HSR and PRP. */ #include "hsr_slave.h" #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include "hsr_main.h" #include "hsr_device.h" #include "hsr_forward.h" #include "hsr_framereg.h" bool hsr_invalid_dan_ingress_frame(__be16 protocol) { return (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR)); } static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct hsr_port *port; struct hsr_priv *hsr; __be16 protocol; /* Packets from dev_loopback_xmit() do not have L2 header, bail out */ if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: skb invalid", __func__); return RX_HANDLER_PASS; } port = hsr_port_get_rcu(skb->dev); if (!port) goto finish_pass; hsr = port->hsr; if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) { /* Directly kill frames sent by ourselves */ kfree_skb(skb); goto finish_consume; } /* For HSR, only tagged frames are expected (unless the device offloads * HSR tag removal), but for PRP there could be non tagged frames as * well from Single attached nodes (SANs). */ protocol = eth_hdr(skb)->h_proto; if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && hsr->proto_ops->invalid_dan_ingress_frame && hsr->proto_ops->invalid_dan_ingress_frame(protocol)) goto finish_pass; skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); if ((!hsr->prot_version && protocol == htons(ETH_P_PRP)) || protocol == htons(ETH_P_HSR)) skb_set_network_header(skb, ETH_HLEN + HSR_HLEN); skb_reset_mac_len(skb); hsr_forward_skb(skb, port); finish_consume: return RX_HANDLER_CONSUMED; finish_pass: return RX_HANDLER_PASS; } bool hsr_port_exists(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == hsr_handle_frame; } static int hsr_check_dev_ok(struct net_device *dev, struct netlink_ext_ack *extack) { /* Don't allow HSR on non-ethernet like devices */ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN) { NL_SET_ERR_MSG_MOD(extack, "Cannot use loopback or non-ethernet device as HSR slave."); return -EINVAL; } /* Don't allow enslaving hsr devices */ if (is_hsr_master(dev)) { NL_SET_ERR_MSG_MOD(extack, "Cannot create trees of HSR devices."); return -EINVAL; } if (hsr_port_exists(dev)) { NL_SET_ERR_MSG_MOD(extack, "This device is already a HSR slave."); return -EINVAL; } if (is_vlan_dev(dev)) { NL_SET_ERR_MSG_MOD(extack, "HSR on top of VLAN is not yet supported in this driver."); return -EINVAL; } if (dev->priv_flags & IFF_DONT_BRIDGE) { NL_SET_ERR_MSG_MOD(extack, "This device does not support bridging."); return -EOPNOTSUPP; } /* HSR over bonded devices has not been tested, but I'm not sure it * won't work... */ return 0; } /* Setup device to be added to the HSR bridge. */ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev, struct hsr_port *port, struct netlink_ext_ack *extack) { struct net_device *hsr_dev; struct hsr_port *master; int res; /* Don't use promiscuous mode for offload since L2 frame forward * happens at the offloaded hardware. */ if (!port->hsr->fwd_offloaded) { res = dev_set_promiscuity(dev, 1); if (res) return res; } master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); hsr_dev = master->dev; res = netdev_upper_dev_link(dev, hsr_dev, extack); if (res) goto fail_upper_dev_link; res = netdev_rx_handler_register(dev, hsr_handle_frame, port); if (res) goto fail_rx_handler; dev_disable_lro(dev); return 0; fail_rx_handler: netdev_upper_dev_unlink(dev, hsr_dev); fail_upper_dev_link: if (!port->hsr->fwd_offloaded) dev_set_promiscuity(dev, -1); return res; } int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, enum hsr_port_type type, struct netlink_ext_ack *extack) { struct hsr_port *port, *master; int res; if (type != HSR_PT_MASTER) { res = hsr_check_dev_ok(dev, extack); if (res) return res; } port = hsr_port_get_hsr(hsr, type); if (port) return -EBUSY; /* This port already exists */ port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; port->hsr = hsr; port->dev = dev; port->type = type; if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(hsr, dev, port, extack); if (res) goto fail_dev_setup; } list_add_tail_rcu(&port->port_list, &hsr->ports); synchronize_rcu(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); return 0; fail_dev_setup: kfree(port); return res; } void hsr_del_port(struct hsr_port *port) { struct hsr_priv *hsr; struct hsr_port *master; hsr = port->hsr; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); list_del_rcu(&port->port_list); if (port != master) { netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); netdev_rx_handler_unregister(port->dev); dev_set_promiscuity(port->dev, -1); netdev_upper_dev_unlink(port->dev, master->dev); } synchronize_rcu(); kfree(port); } |
628 585 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions related to Power Management Quality of Service (PM QoS). * * Copyright (C) 2020 Intel Corporation * * Authors: * Mark Gross <mgross@linux.intel.com> * Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ #ifndef _LINUX_PM_QOS_H #define _LINUX_PM_QOS_H #include <linux/plist.h> #include <linux/notifier.h> #include <linux/device.h> enum pm_qos_flags_status { PM_QOS_FLAGS_UNDEFINED = -1, PM_QOS_FLAGS_NONE, PM_QOS_FLAGS_SOME, PM_QOS_FLAGS_ALL, }; #define PM_QOS_DEFAULT_VALUE (-1) #define PM_QOS_LATENCY_ANY S32_MAX #define PM_QOS_LATENCY_ANY_NS ((s64)PM_QOS_LATENCY_ANY * NSEC_PER_USEC) #define PM_QOS_CPU_LATENCY_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_RESUME_LATENCY_DEFAULT_VALUE PM_QOS_LATENCY_ANY #define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT PM_QOS_LATENCY_ANY #define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS PM_QOS_LATENCY_ANY_NS #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE 0 #define PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE 0 #define PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE FREQ_QOS_MAX_DEFAULT_VALUE #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT (-1) #define PM_QOS_FLAG_NO_POWER_OFF (1 << 0) enum pm_qos_type { PM_QOS_UNITIALIZED, PM_QOS_MAX, /* return the largest value */ PM_QOS_MIN, /* return the smallest value */ }; /* * Note: The lockless read path depends on the CPU accessing target_value * or effective_flags atomically. Atomic access is only guaranteed on all CPU * types linux supports for 32 bit quantites */ struct pm_qos_constraints { struct plist_head list; s32 target_value; /* Do not change to 64 bit */ s32 default_value; s32 no_constraint_value; enum pm_qos_type type; struct blocking_notifier_head *notifiers; }; struct pm_qos_request { struct plist_node node; struct pm_qos_constraints *qos; }; struct pm_qos_flags_request { struct list_head node; s32 flags; /* Do not change to 64 bit */ }; struct pm_qos_flags { struct list_head list; s32 effective_flags; /* Do not change to 64 bit */ }; #define FREQ_QOS_MIN_DEFAULT_VALUE 0 #define FREQ_QOS_MAX_DEFAULT_VALUE S32_MAX enum freq_qos_req_type { FREQ_QOS_MIN = 1, FREQ_QOS_MAX, }; struct freq_constraints { struct pm_qos_constraints min_freq; struct blocking_notifier_head min_freq_notifiers; struct pm_qos_constraints max_freq; struct blocking_notifier_head max_freq_notifiers; }; struct freq_qos_request { enum freq_qos_req_type type; struct plist_node pnode; struct freq_constraints *qos; }; enum dev_pm_qos_req_type { DEV_PM_QOS_RESUME_LATENCY = 1, DEV_PM_QOS_LATENCY_TOLERANCE, DEV_PM_QOS_MIN_FREQUENCY, DEV_PM_QOS_MAX_FREQUENCY, DEV_PM_QOS_FLAGS, }; struct dev_pm_qos_request { enum dev_pm_qos_req_type type; union { struct plist_node pnode; struct pm_qos_flags_request flr; struct freq_qos_request freq; } data; struct device *dev; }; struct dev_pm_qos { struct pm_qos_constraints resume_latency; struct pm_qos_constraints latency_tolerance; struct freq_constraints freq; struct pm_qos_flags flags; struct dev_pm_qos_request *resume_latency_req; struct dev_pm_qos_request *latency_tolerance_req; struct dev_pm_qos_request *flags_req; }; /* Action requested to pm_qos_update_target */ enum pm_qos_req_action { PM_QOS_ADD_REQ, /* Add a new request */ PM_QOS_UPDATE_REQ, /* Update an existing request */ PM_QOS_REMOVE_REQ /* Remove an existing request */ }; static inline int dev_pm_qos_request_active(struct dev_pm_qos_request *req) { return req->dev != NULL; } s32 pm_qos_read_value(struct pm_qos_constraints *c); int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, enum pm_qos_req_action action, int value); bool pm_qos_update_flags(struct pm_qos_flags *pqf, struct pm_qos_flags_request *req, enum pm_qos_req_action action, s32 val); #ifdef CONFIG_CPU_IDLE s32 cpu_latency_qos_limit(void); bool cpu_latency_qos_request_active(struct pm_qos_request *req); void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value); void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value); void cpu_latency_qos_remove_request(struct pm_qos_request *req); #else static inline s32 cpu_latency_qos_limit(void) { return INT_MAX; } static inline bool cpu_latency_qos_request_active(struct pm_qos_request *req) { return false; } static inline void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) {} static inline void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) {} static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {} #endif #ifdef CONFIG_PM enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask); enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask); s32 __dev_pm_qos_resume_latency(struct device *dev); s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type); int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value); int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value); int dev_pm_qos_remove_request(struct dev_pm_qos_request *req); int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type); int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type); void dev_pm_qos_constraints_init(struct device *dev); void dev_pm_qos_constraints_destroy(struct device *dev); int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value); int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value); void dev_pm_qos_hide_latency_limit(struct device *dev); int dev_pm_qos_expose_flags(struct device *dev, s32 value); void dev_pm_qos_hide_flags(struct device *dev); int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set); s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev); int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val); int dev_pm_qos_expose_latency_tolerance(struct device *dev); void dev_pm_qos_hide_latency_tolerance(struct device *dev); static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev) { return dev->power.qos->resume_latency_req->data.pnode.prio; } static inline s32 dev_pm_qos_requested_flags(struct device *dev) { return dev->power.qos->flags_req->data.flr.flags; } static inline s32 dev_pm_qos_raw_resume_latency(struct device *dev) { return IS_ERR_OR_NULL(dev->power.qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&dev->power.qos->resume_latency); } #else static inline enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { return PM_QOS_FLAGS_UNDEFINED; } static inline enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { return PM_QOS_FLAGS_UNDEFINED; } static inline s32 __dev_pm_qos_resume_latency(struct device *dev) { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } static inline s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { switch (type) { case DEV_PM_QOS_RESUME_LATENCY: return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; case DEV_PM_QOS_MIN_FREQUENCY: return PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE; case DEV_PM_QOS_MAX_FREQUENCY: return PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE; default: WARN_ON(1); return 0; } } static inline int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { return 0; } static inline int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { return 0; } static inline int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { return 0; } static inline int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { return 0; } static inline int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { return 0; } static inline void dev_pm_qos_constraints_init(struct device *dev) { dev->power.power_state = PMSG_ON; } static inline void dev_pm_qos_constraints_destroy(struct device *dev) { dev->power.power_state = PMSG_INVALID; } static inline int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { return 0; } static inline int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { return 0; } static inline void dev_pm_qos_hide_latency_limit(struct device *dev) {} static inline int dev_pm_qos_expose_flags(struct device *dev, s32 value) { return 0; } static inline void dev_pm_qos_hide_flags(struct device *dev) {} static inline int dev_pm_qos_update_flags(struct device *dev, s32 m, bool set) { return 0; } static inline s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { return PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; } static inline int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { return 0; } static inline int dev_pm_qos_expose_latency_tolerance(struct device *dev) { return 0; } static inline void dev_pm_qos_hide_latency_tolerance(struct device *dev) {} static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev) { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } static inline s32 dev_pm_qos_requested_flags(struct device *dev) { return 0; } static inline s32 dev_pm_qos_raw_resume_latency(struct device *dev) { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } #endif static inline int freq_qos_request_active(struct freq_qos_request *req) { return !IS_ERR_OR_NULL(req->qos); } void freq_constraints_init(struct freq_constraints *qos); s32 freq_qos_read_value(struct freq_constraints *qos, enum freq_qos_req_type type); int freq_qos_add_request(struct freq_constraints *qos, struct freq_qos_request *req, enum freq_qos_req_type type, s32 value); int freq_qos_update_request(struct freq_qos_request *req, s32 new_value); int freq_qos_remove_request(struct freq_qos_request *req); int freq_qos_apply(struct freq_qos_request *req, enum pm_qos_req_action action, s32 value); int freq_qos_add_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, struct notifier_block *notifier); int freq_qos_remove_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, struct notifier_block *notifier); #endif |
9435 2064 7732 32 32 31 1 32 162 3 140 135 21 7 6 5 141 2 136 136 25 23 2 1 82 127 554 7486 121 122 122 18 18 18 18 17 18 17 1 1 354 50 456 22 168 143 212 72 71 66 297 297 368 25 353 40 46 144 89 8398 14 7705 7744 7530 343 7233 7573 337 8479 5 4 7 4 4 8 11 10 2 4 3 2 3 7 1 6 5 5 3 2 5 5 5 2 3 14 1 6 2 5 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 | // SPDX-License-Identifier: GPL-2.0-only /* * Generic pidhash and scalable, time-bounded PID allocator * * (C) 2002-2003 Nadia Yvette Chambers, IBM * (C) 2004 Nadia Yvette Chambers, Oracle * (C) 2002-2004 Ingo Molnar, Red Hat * * pid-structures are backing objects for tasks sharing a given ID to chain * against. There is very little to them aside from hashing them and * parking tasks using given ID's on a list. * * The hash is always changed with the tasklist_lock write-acquired, * and the hash is only accessed with the tasklist_lock at least * read-acquired, so there's no additional SMP locking needed here. * * We have a list of bitmap pages, which bitmaps represent the PID space. * Allocating and freeing PIDs is completely lockless. The worst-case * allocation scenario when all but one out of 1 million PIDs possible are * allocated already: the scanning of 32 list entries and at most PAGE_SIZE * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). * * Pid namespaces: * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM * Many thanks to Oleg Nesterov for comments and help * */ #include <linux/mm.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/memblock.h> #include <linux/pid_namespace.h> #include <linux/init_task.h> #include <linux/syscalls.h> #include <linux/proc_ns.h> #include <linux/refcount.h> #include <linux/anon_inodes.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/idr.h> #include <linux/pidfs.h> #include <net/sock.h> #include <uapi/linux/pidfd.h> struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), .tasks = { { .first = NULL }, { .first = NULL }, { .first = NULL }, }, .level = 0, .numbers = { { .nr = 0, .ns = &init_pid_ns, }, } }; int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; /* * Pseudo filesystems start inode numbering after one. We use Reserved * PIDs as a natural offset. */ static u64 pidfs_ino = RESERVED_PIDS; /* * PID-map pages start out as NULL, they get allocated upon * first use and are never deallocated. This way a low pid_max * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { .ns.count = REFCOUNT_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, .ns.inum = PROC_PID_INIT_INO, #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif }; EXPORT_SYMBOL_GPL(init_pid_ns); /* * Note: disable interrupts while the pidmap_lock is held as an * interrupt might come in and do read_lock(&tasklist_lock). * * If we don't disable interrupts there is a nasty deadlock between * detach_pid()->free_pid() and another cpu that does * spin_lock(&pidmap_lock) followed by an interrupt routine that does * read_lock(&tasklist_lock); * * After we clean up the tasklist_lock and know there are no * irq handlers that take it we can leave the interrupts enabled. * For now it is easier to be safe than to prove it can't happen. */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); void put_pid(struct pid *pid) { struct pid_namespace *ns; if (!pid) return; ns = pid->numbers[pid->level].ns; if (refcount_dec_and_test(&pid->count)) { kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } } EXPORT_SYMBOL_GPL(put_pid); static void delayed_put_pid(struct rcu_head *rhp) { struct pid *pid = container_of(rhp, struct pid, rcu); put_pid(pid); } void free_pid(struct pid *pid) { /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; switch (--ns->pid_allocated) { case 2: case 1: /* When all that is left in the pid namespace * is the reaper wake up the reaper. The reaper * may be sleeping in zap_pid_ns_processes(). */ wake_up_process(ns->child_reaper); break; case PIDNS_ADDING: /* Handle a fork failure of the first process */ WARN_ON(ns->child_reaper); ns->pid_allocated = 0; break; } idr_remove(&ns->idr, upid->nr); } spin_unlock_irqrestore(&pidmap_lock, flags); call_rcu(&pid->rcu, delayed_put_pid); } struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size) { struct pid *pid; enum pid_type type; int i, nr; struct pid_namespace *tmp; struct upid *upid; int retval = -ENOMEM; /* * set_tid_size contains the size of the set_tid array. Starting at * the most nested currently active PID namespace it tells alloc_pid() * which PID to set for a process in that most nested PID namespace * up to set_tid_size PID namespaces. It does not have to set the PID * for a process in all nested PID namespaces but set_tid_size must * never be greater than the current ns->level + 1. */ if (set_tid_size > ns->level + 1) return ERR_PTR(-EINVAL); pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) return ERR_PTR(retval); tmp = ns; pid->level = ns->level; for (i = ns->level; i >= 0; i--) { int tid = 0; if (set_tid_size) { tid = set_tid[ns->level - i]; retval = -EINVAL; if (tid < 1 || tid >= pid_max) goto out_free; /* * Also fail if a PID != 1 is requested and * no PID 1 exists. */ if (tid != 1 && !tmp->child_reaper) goto out_free; retval = -EPERM; if (!checkpoint_restore_ns_capable(tmp->user_ns)) goto out_free; set_tid_size--; } idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, tid + 1, GFP_ATOMIC); /* * If ENOSPC is returned it means that the PID is * alreay in use. Return EEXIST in that case. */ if (nr == -ENOSPC) nr = -EEXIST; } else { int pid_min = 1; /* * init really needs pid 1, but after reaching the * maximum wrap back to RESERVED_PIDS */ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) pid_min = RESERVED_PIDS; /* * Store a null pointer so find_pid_ns does not find * a partially initialized PID (see below). */ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, pid_max, GFP_ATOMIC); } spin_unlock_irq(&pidmap_lock); idr_preload_end(); if (nr < 0) { retval = (nr == -ENOSPC) ? -EAGAIN : nr; goto out_free; } pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; tmp = tmp->parent; } /* * ENOMEM is not the most obvious choice especially for the case * where the child subreaper has already exited and the pid * namespace denies the creation of any new processes. But ENOMEM * is what we have exposed to userspace for a long time and it is * documented behavior for pid namespaces. So we can't easily * change it even if there were an error code better suited. */ retval = -ENOMEM; get_pid_ns(ns); refcount_set(&pid->count, 1); spin_lock_init(&pid->lock); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); init_waitqueue_head(&pid->wait_pidfd); INIT_HLIST_HEAD(&pid->inodes); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; pid->stashed = NULL; pid->ino = ++pidfs_ino; for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); return pid; out_unlock: spin_unlock_irq(&pidmap_lock); put_pid_ns(ns); out_free: spin_lock_irq(&pidmap_lock); while (++i <= ns->level) { upid = pid->numbers + i; idr_remove(&upid->ns->idr, upid->nr); } /* On failure to allocate the first pid, reset the state */ if (ns->pid_allocated == PIDNS_ADDING) idr_set_cursor(&ns->idr, 0); spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); } void disable_pid_allocation(struct pid_namespace *ns) { spin_lock_irq(&pidmap_lock); ns->pid_allocated &= ~PIDNS_ADDING; spin_unlock_irq(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { return idr_find(&ns->idr, nr); } EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { return find_pid_ns(nr, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(find_vpid); static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) { return (type == PIDTYPE_PID) ? &task->thread_pid : &task->signal->pids[type]; } /* * attach_pid() must be called with the tasklist_lock write-held. */ void attach_pid(struct task_struct *task, enum pid_type type) { struct pid *pid = *task_pid_ptr(task, type); hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } static void __change_pid(struct task_struct *task, enum pid_type type, struct pid *new) { struct pid **pid_ptr = task_pid_ptr(task, type); struct pid *pid; int tmp; pid = *pid_ptr; hlist_del_rcu(&task->pid_links[type]); *pid_ptr = new; if (type == PIDTYPE_PID) { WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID)); wake_up_all(&pid->wait_pidfd); } for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (pid_has_task(pid, tmp)) return; free_pid(pid); } void detach_pid(struct task_struct *task, enum pid_type type) { __change_pid(task, type, NULL); } void change_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { __change_pid(task, type, pid); attach_pid(task, type); } void exchange_tids(struct task_struct *left, struct task_struct *right) { struct pid *pid1 = left->thread_pid; struct pid *pid2 = right->thread_pid; struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; /* Swap the single entry tid lists */ hlists_swap_heads_rcu(head1, head2); /* Swap the per task_struct pid */ rcu_assign_pointer(left->thread_pid, pid2); rcu_assign_pointer(right->thread_pid, pid1); /* Swap the cached value */ WRITE_ONCE(left->pid, pid_nr(pid2)); WRITE_ONCE(right->pid, pid_nr(pid1)); } /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { WARN_ON_ONCE(type == PIDTYPE_PID); hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } struct task_struct *pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result = NULL; if (pid) { struct hlist_node *first; first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), lockdep_tasklist_lock_is_held()); if (first) result = hlist_entry(first, struct task_struct, pid_links[(type)]); } return result; } EXPORT_SYMBOL(pid_task); /* * Must be called under rcu_read_lock(). */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "find_task_by_pid_ns() needs rcu_read_lock() protection"); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } struct task_struct *find_task_by_vpid(pid_t vnr) { return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } struct task_struct *find_get_task_by_vpid(pid_t nr) { struct task_struct *task; rcu_read_lock(); task = find_task_by_vpid(nr); if (task) get_task_struct(task); rcu_read_unlock(); return task; } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { struct pid *pid; rcu_read_lock(); pid = get_pid(rcu_dereference(*task_pid_ptr(task, type))); rcu_read_unlock(); return pid; } EXPORT_SYMBOL_GPL(get_task_pid); struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result; rcu_read_lock(); result = pid_task(pid, type); if (result) get_task_struct(result); rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(get_pid_task); struct pid *find_get_pid(pid_t nr) { struct pid *pid; rcu_read_lock(); pid = get_pid(find_vpid(nr)); rcu_read_unlock(); return pid; } EXPORT_SYMBOL_GPL(find_get_pid); pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) { struct upid *upid; pid_t nr = 0; if (pid && ns->level <= pid->level) { upid = &pid->numbers[ns->level]; if (upid->ns == ns) nr = upid->nr; } return nr; } EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { return pid_nr_ns(pid, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(pid_vnr); pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns) { pid_t nr = 0; rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; } EXPORT_SYMBOL(__task_pid_nr_ns); struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); } EXPORT_SYMBOL_GPL(task_active_pid_ns); /* * Used by proc to find the first pid that is greater than or equal to nr. * * If there is a pid at nr this function is exactly the same as find_pid_ns. */ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { return idr_get_next(&ns->idr, &nr); } EXPORT_SYMBOL_GPL(find_ge_pid); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) { struct fd f; struct pid *pid; f = fdget(fd); if (!f.file) return ERR_PTR(-EBADF); pid = pidfd_pid(f.file); if (!IS_ERR(pid)) { get_pid(pid); *flags = f.file->f_flags; } fdput(f); return pid; } /** * pidfd_get_task() - Get the task associated with a pidfd * * @pidfd: pidfd for which to get the task * @flags: flags associated with this pidfd * * Return the task associated with @pidfd. The function takes a reference on * the returned task. The caller is responsible for releasing that reference. * * Return: On success, the task_struct associated with the pidfd. * On error, a negative errno number will be returned. */ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) { unsigned int f_flags; struct pid *pid; struct task_struct *task; pid = pidfd_get_pid(pidfd, &f_flags); if (IS_ERR(pid)) return ERR_CAST(pid); task = get_pid_task(pid, PIDTYPE_TGID); put_pid(pid); if (!task) return ERR_PTR(-ESRCH); *flags = f_flags; return task; } /** * pidfd_create() - Create a new pid file descriptor. * * @pid: struct pid that the pidfd will reference * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set. * * Note, that this function can only be called after the fd table has * been unshared to avoid leaking the pidfd to the new process. * * This symbol should not be explicitly exported to loadable modules. * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ static int pidfd_create(struct pid *pid, unsigned int flags) { int pidfd; struct file *pidfd_file; pidfd = pidfd_prepare(pid, flags, &pidfd_file); if (pidfd < 0) return pidfd; fd_install(pidfd, pidfd_file); return pidfd; } /** * sys_pidfd_open() - Open new pid file descriptor. * * @pid: pid for which to retrieve a pidfd * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set for * the task identified by @pid. Without PIDFD_THREAD flag the target task * must be a thread-group leader. * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) { int fd; struct pid *p; if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD)) return -EINVAL; if (pid <= 0) return -EINVAL; p = find_get_pid(pid); if (!p) return -ESRCH; fd = pidfd_create(p, flags); put_pid(p); return fd; } void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); /* bump default and minimum pid_max based on number of cpus */ pid_max = min(pid_max_max, max_t(int, pid_max, PIDS_PER_CPU_DEFAULT * num_possible_cpus())); pid_max_min = max_t(int, pid_max_min, PIDS_PER_CPU_MIN * num_possible_cpus()); pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); idr_init(&init_pid_ns.idr); init_pid_ns.pid_cachep = kmem_cache_create("pid", struct_size_t(struct pid, numbers, 1), __alignof__(struct pid), SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); } static struct file *__pidfd_fget(struct task_struct *task, int fd) { struct file *file; int ret; ret = down_read_killable(&task->signal->exec_update_lock); if (ret) return ERR_PTR(ret); if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) file = fget_task(task, fd); else file = ERR_PTR(-EPERM); up_read(&task->signal->exec_update_lock); if (!file) { /* * It is possible that the target thread is exiting; it can be * either: * 1. before exit_signals(), which gives a real fd * 2. before exit_files() takes the task_lock() gives a real fd * 3. after exit_files() releases task_lock(), ->files is NULL; * this has PF_EXITING, since it was set in exit_signals(), * __pidfd_fget() returns EBADF. * In case 3 we get EBADF, but that really means ESRCH, since * the task is currently exiting and has freed its files * struct, so we fix it up. */ if (task->flags & PF_EXITING) file = ERR_PTR(-ESRCH); else file = ERR_PTR(-EBADF); } return file; } static int pidfd_getfd(struct pid *pid, int fd) { struct task_struct *task; struct file *file; int ret; task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; file = __pidfd_fget(task, fd); put_task_struct(task); if (IS_ERR(file)) return PTR_ERR(file); ret = receive_fd(file, NULL, O_CLOEXEC); fput(file); return ret; } /** * sys_pidfd_getfd() - Get a file descriptor from another process * * @pidfd: the pidfd file descriptor of the process * @fd: the file descriptor number to get * @flags: flags on how to get the fd (reserved) * * This syscall gets a copy of a file descriptor from another process * based on the pidfd, and file descriptor number. It requires that * the calling process has the ability to ptrace the process represented * by the pidfd. The process which is having its file descriptor copied * is otherwise unaffected. * * Return: On success, a cloexec file descriptor is returned. * On error, a negative errno number will be returned. */ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd, unsigned int, flags) { struct pid *pid; struct fd f; int ret; /* flags is currently unused - make sure it's unset */ if (flags) return -EINVAL; f = fdget(pidfd); if (!f.file) return -EBADF; pid = pidfd_pid(f.file); if (IS_ERR(pid)) ret = PTR_ERR(pid); else ret = pidfd_getfd(pid, fd); fdput(f); return ret; } |
27 93 1 6 49 49 1 21 33 33 33 52 52 52 52 54 9 47 14 6 37 14 37 8 48 35 35 117 75 49 52 54 10 115 1111 1106 99 1112 113 44 3 43 44 2 44 52 52 52 2 52 46 39 18 54 18 15 5 2 9 5 2 38 2 2 38 30 3 18 12 7 71 72 16 57 85 35 74 9 72 74 1 2 73 3 3 2 1 2 64 62 1 63 60 3 63 63 6 53 3 37 28 16 16 16 14 2 3 5 10 13 9 2 8 8 8 8 3 6 8 8 8 4 4 4 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 | // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/mlock.c * * (C) Copyright 1995 Linus Torvalds * (C) Copyright 2002 Christoph Hellwig */ #include <linux/capability.h> #include <linux/mman.h> #include <linux/mm.h> #include <linux/sched/user.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/pagewalk.h> #include <linux/mempolicy.h> #include <linux/syscalls.h> #include <linux/sched.h> #include <linux/export.h> #include <linux/rmap.h> #include <linux/mmzone.h> #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/secretmem.h> #include "internal.h" struct mlock_fbatch { local_lock_t lock; struct folio_batch fbatch; }; static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = { .lock = INIT_LOCAL_LOCK(lock), }; bool can_do_mlock(void) { if (rlimit(RLIMIT_MEMLOCK) != 0) return true; if (capable(CAP_IPC_LOCK)) return true; return false; } EXPORT_SYMBOL(can_do_mlock); /* * Mlocked folios are marked with the PG_mlocked flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate * statistics. * * An mlocked folio [folio_test_mlocked(folio)] is unevictable. As such, it * will be ostensibly placed on the LRU "unevictable" list (actually no such * list exists), rather than the [in]active lists. PG_unevictable is set to * indicate the unevictable state. */ static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec) { /* There is nothing more we can do while it's off LRU */ if (!folio_test_clear_lru(folio)) return lruvec; lruvec = folio_lruvec_relock_irq(folio, lruvec); if (unlikely(folio_evictable(folio))) { /* * This is a little surprising, but quite possible: PG_mlocked * must have got cleared already by another CPU. Could this * folio be unevictable? I'm not sure, but move it now if so. */ if (folio_test_unevictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_unevictable(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGRESCUED, folio_nr_pages(folio)); } goto out; } if (folio_test_unevictable(folio)) { if (folio_test_mlocked(folio)) folio->mlock_count++; goto out; } lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_set_unevictable(folio); folio->mlock_count = !!folio_test_mlocked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: folio_set_lru(folio); return lruvec; } static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); lruvec = folio_lruvec_relock_irq(folio, lruvec); /* As above, this is a little surprising, but possible */ if (unlikely(folio_evictable(folio))) goto out; folio_set_unevictable(folio); folio->mlock_count = !!folio_test_mlocked(folio); __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: lruvec_add_folio(lruvec, folio); folio_set_lru(folio); return lruvec; } static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec) { int nr_pages = folio_nr_pages(folio); bool isolated = false; if (!folio_test_clear_lru(folio)) goto munlock; isolated = true; lruvec = folio_lruvec_relock_irq(folio, lruvec); if (folio_test_unevictable(folio)) { /* Then mlock_count is maintained, but might undercount */ if (folio->mlock_count) folio->mlock_count--; if (folio->mlock_count) goto out; } /* else assume that was the last mlock: reclaim will fix it if not */ munlock: if (folio_test_clear_mlocked(folio)) { __zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); if (isolated || !folio_test_unevictable(folio)) __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); else __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } /* folio_evictable() has to be checked *after* clearing Mlocked */ if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_unevictable(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } out: if (isolated) folio_set_lru(folio); return lruvec; } /* * Flags held in the low bits of a struct folio pointer on the mlock_fbatch. */ #define LRU_FOLIO 0x1 #define NEW_FOLIO 0x2 static inline struct folio *mlock_lru(struct folio *folio) { return (struct folio *)((unsigned long)folio + LRU_FOLIO); } static inline struct folio *mlock_new(struct folio *folio) { return (struct folio *)((unsigned long)folio + NEW_FOLIO); } /* * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can * make use of such folio pointer flags in future, but for now just keep it for * mlock. We could use three separate folio batches instead, but one feels * better (munlocking a full folio batch does not need to drain mlocking folio * batches first). */ static void mlock_folio_batch(struct folio_batch *fbatch) { struct lruvec *lruvec = NULL; unsigned long mlock; struct folio *folio; int i; for (i = 0; i < folio_batch_count(fbatch); i++) { folio = fbatch->folios[i]; mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO); folio = (struct folio *)((unsigned long)folio - mlock); fbatch->folios[i] = folio; if (mlock & LRU_FOLIO) lruvec = __mlock_folio(folio, lruvec); else if (mlock & NEW_FOLIO) lruvec = __mlock_new_folio(folio, lruvec); else lruvec = __munlock_folio(folio, lruvec); } if (lruvec) unlock_page_lruvec_irq(lruvec); folios_put(fbatch); } void mlock_drain_local(void) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); if (folio_batch_count(fbatch)) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } void mlock_drain_remote(int cpu) { struct folio_batch *fbatch; WARN_ON_ONCE(cpu_online(cpu)); fbatch = &per_cpu(mlock_fbatch.fbatch, cpu); if (folio_batch_count(fbatch)) mlock_folio_batch(fbatch); } bool need_mlock_drain(int cpu) { return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu)); } /** * mlock_folio - mlock a folio already on (or temporarily off) LRU * @folio: folio to be mlocked. */ void mlock_folio(struct folio *folio) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); if (!folio_test_set_mlocked(folio)) { int nr_pages = folio_nr_pages(folio); zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); } folio_get(folio); if (!folio_batch_add(fbatch, mlock_lru(folio)) || folio_test_large(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } /** * mlock_new_folio - mlock a newly allocated folio not yet on LRU * @folio: folio to be mlocked, either normal or a THP head. */ void mlock_new_folio(struct folio *folio) { struct folio_batch *fbatch; int nr_pages = folio_nr_pages(folio); local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); folio_set_mlocked(folio); zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); folio_get(folio); if (!folio_batch_add(fbatch, mlock_new(folio)) || folio_test_large(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } /** * munlock_folio - munlock a folio * @folio: folio to be munlocked, either normal or a THP head. */ void munlock_folio(struct folio *folio) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); /* * folio_test_clear_mlocked(folio) must be left to __munlock_folio(), * which will check whether the folio is multiply mlocked. */ folio_get(folio); if (!folio_batch_add(fbatch, folio) || folio_test_large(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } static inline unsigned int folio_mlock_step(struct folio *folio, pte_t *pte, unsigned long addr, unsigned long end) { unsigned int count, i, nr = folio_nr_pages(folio); unsigned long pfn = folio_pfn(folio); pte_t ptent = ptep_get(pte); if (!folio_test_large(folio)) return 1; count = pfn + nr - pte_pfn(ptent); count = min_t(unsigned int, count, (end - addr) >> PAGE_SHIFT); for (i = 0; i < count; i++, pte++) { pte_t entry = ptep_get(pte); if (!pte_present(entry)) break; if (pte_pfn(entry) - pfn >= nr) break; } return i; } static inline bool allow_mlock_munlock(struct folio *folio, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned int step) { /* * For unlock, allow munlock large folio which is partially * mapped to VMA. As it's possible that large folio is * mlocked and VMA is split later. * * During memory pressure, such kind of large folio can * be split. And the pages are not in VM_LOCKed VMA * can be reclaimed. */ if (!(vma->vm_flags & VM_LOCKED)) return true; /* folio_within_range() cannot take KSM, but any small folio is OK */ if (!folio_test_large(folio)) return true; /* folio not in range [start, end), skip mlock */ if (!folio_within_range(folio, vma, start, end)) return false; /* folio is not fully mapped, skip mlock */ if (step != folio_nr_pages(folio)) return false; return true; } static int mlock_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte; pte_t ptent; struct folio *folio; unsigned int step = 1; unsigned long start = addr; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (!pmd_present(*pmd)) goto out; if (is_huge_zero_pmd(*pmd)) goto out; folio = page_folio(pmd_page(*pmd)); if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else munlock_folio(folio); goto out; } start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!start_pte) { walk->action = ACTION_AGAIN; return 0; } for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; step = folio_mlock_step(folio, pte, addr, end); if (!allow_mlock_munlock(folio, vma, start, end, step)) goto next_entry; if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else munlock_folio(folio); next_entry: pte += step - 1; addr += (step - 1) << PAGE_SHIFT; } pte_unmap(start_pte); out: spin_unlock(ptl); cond_resched(); return 0; } /* * mlock_vma_pages_range() - mlock any pages already in the range, * or munlock all pages in the range. * @vma - vma containing range to be mlock()ed or munlock()ed * @start - start address in @vma of the range * @end - end of range in @vma * @newflags - the new set of flags for @vma. * * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. */ static void mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, vm_flags_t newflags) { static const struct mm_walk_ops mlock_walk_ops = { .pmd_entry = mlock_pte_range, .walk_lock = PGWALK_WRLOCK_VERIFY, }; /* * There is a slight chance that concurrent page migration, * or page reclaim finding a page of this now-VM_LOCKED vma, * will call mlock_vma_folio() and raise page's mlock_count: * double counting, leaving the page unevictable indefinitely. * Communicate this danger to mlock_vma_folio() with VM_IO, * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas. * mmap_lock is held in write mode here, so this weird * combination should not be visible to other mmap_lock users; * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED. */ if (newflags & VM_LOCKED) newflags |= VM_IO; vma_start_write(vma); vm_flags_reset_once(vma, newflags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); lru_add_drain(); if (newflags & VM_IO) { newflags &= ~VM_IO; vm_flags_reset_once(vma, newflags); } } /* * mlock_fixup - handle mlock[all]/munlock[all] requests. * * Filters out "special" vmas -- VM_LOCKED never gets set for these, and * munlock is a no-op. However, for some special vmas, we go ahead and * populate the ptes. * * For vmas that pass the filters, merge/split as appropriate. */ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; int nr_pages; int ret = 0; vm_flags_t oldflags = vma->vm_flags; if (newflags == oldflags || (oldflags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || vma_is_dax(vma) || vma_is_secretmem(vma)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; } /* * Keep track of amount of locked VM. */ nr_pages = (end - start) >> PAGE_SHIFT; if (!(newflags & VM_LOCKED)) nr_pages = -nr_pages; else if (oldflags & VM_LOCKED) nr_pages = 0; mm->locked_vm += nr_pages; /* * vm_flags is protected by the mmap_lock held in write mode. * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, populate_vma_page_range will bring it back. */ if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { /* No work to do, and mlocking twice would be wrong */ vma_start_write(vma); vm_flags_reset(vma, newflags); } else { mlock_vma_pages_range(vma, start, end, newflags); } out: *prev = vma; return ret; } static int apply_vma_lock_flags(unsigned long start, size_t len, vm_flags_t flags) { unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; VMA_ITERATOR(vmi, current->mm, start); VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; vma = vma_iter_load(&vmi); if (!vma) return -ENOMEM; prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; nstart = start; tmp = vma->vm_start; for_each_vma_range(vmi, vma, end) { int error; vm_flags_t newflags; if (vma->vm_start != tmp) return -ENOMEM; newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= flags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; if (tmp > end) tmp = end; error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags); if (error) return error; tmp = vma_iter_end(&vmi); nstart = tmp; } if (tmp < end) return -ENOMEM; return 0; } /* * Go through vma areas and sum size of mlocked * vma pages, as return value. * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT) * is also counted. * Return value: previously mlocked page counts */ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, unsigned long start, size_t len) { struct vm_area_struct *vma; unsigned long count = 0; unsigned long end; VMA_ITERATOR(vmi, mm, start); /* Don't overflow past ULONG_MAX */ if (unlikely(ULONG_MAX - len < start)) end = ULONG_MAX; else end = start + len; for_each_vma_range(vmi, vma, end) { if (vma->vm_flags & VM_LOCKED) { if (start > vma->vm_start) count -= (start - vma->vm_start); if (end < vma->vm_end) { count += end - vma->vm_start; break; } count += vma->vm_end - vma->vm_start; } } return count >> PAGE_SHIFT; } /* * convert get_user_pages() return value to posix mlock() error */ static int __mlock_posix_error_return(long retval) { if (retval == -EFAULT) retval = -ENOMEM; else if (retval == -ENOMEM) retval = -EAGAIN; return retval; } static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; unsigned long lock_limit; int error = -ENOMEM; start = untagged_addr(start); if (!can_do_mlock()) return -EPERM; len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = len >> PAGE_SHIFT; if (mmap_write_lock_killable(current->mm)) return -EINTR; locked += current->mm->locked_vm; if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) { /* * It is possible that the regions requested intersect with * previously mlocked areas, that part area in "mm->locked_vm" * should not be counted to new mlock increment count. So check * and adjust locked count if necessary. */ locked -= count_mm_mlocked_page_nr(current->mm, start, len); } /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = apply_vma_lock_flags(start, len, flags); mmap_write_unlock(current->mm); if (error) return error; error = __mm_populate(start, len, 0); if (error) return __mlock_posix_error_return(error); return 0; } SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { return do_mlock(start, len, VM_LOCKED); } SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) { vm_flags_t vm_flags = VM_LOCKED; if (flags & ~MLOCK_ONFAULT) return -EINVAL; if (flags & MLOCK_ONFAULT) vm_flags |= VM_LOCKONFAULT; return do_mlock(start, len, vm_flags); } SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; start = untagged_addr(start); len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = apply_vma_lock_flags(start, len, 0); mmap_write_unlock(current->mm); return ret; } /* * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall) * and translate into the appropriate modifications to mm->def_flags and/or the * flags for all current VMAs. * * There are a couple of subtleties with this. If mlockall() is called multiple * times with different flags, the values do not necessarily stack. If mlockall * is called once including the MCL_FUTURE flag and then a second time without * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags. */ static int apply_mlockall_flags(int flags) { VMA_ITERATOR(vmi, current->mm, 0); struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; current->mm->def_flags &= ~VM_LOCKED_MASK; if (flags & MCL_FUTURE) { current->mm->def_flags |= VM_LOCKED; if (flags & MCL_ONFAULT) current->mm->def_flags |= VM_LOCKONFAULT; if (!(flags & MCL_CURRENT)) goto out; } if (flags & MCL_CURRENT) { to_add |= VM_LOCKED; if (flags & MCL_ONFAULT) to_add |= VM_LOCKONFAULT; } for_each_vma(vmi, vma) { vm_flags_t newflags; newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= to_add; /* Ignore errors */ mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, newflags); cond_resched(); } out: return 0; } SYSCALL_DEFINE1(mlockall, int, flags) { unsigned long lock_limit; int ret; if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) || flags == MCL_ONFAULT) return -EINVAL; if (!can_do_mlock()) return -EPERM; lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = -ENOMEM; if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) ret = apply_mlockall_flags(flags); mmap_write_unlock(current->mm); if (!ret && (flags & MCL_CURRENT)) mm_populate(0, TASK_SIZE); return ret; } SYSCALL_DEFINE0(munlockall) { int ret; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = apply_mlockall_flags(0); mmap_write_unlock(current->mm); return ret; } /* * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB * shm segments) get accounted against the user_struct instead. */ static DEFINE_SPINLOCK(shmlock_user_lock); int user_shm_lock(size_t size, struct ucounts *ucounts) { unsigned long lock_limit, locked; long memlock; int allowed = 0; locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; lock_limit = rlimit(RLIMIT_MEMLOCK); if (lock_limit != RLIM_INFINITY) lock_limit >>= PAGE_SHIFT; spin_lock(&shmlock_user_lock); memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) { dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); goto out; } if (!get_ucounts(ucounts)) { dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); allowed = 0; goto out; } allowed = 1; out: spin_unlock(&shmlock_user_lock); return allowed; } void user_shm_unlock(size_t size, struct ucounts *ucounts) { spin_lock(&shmlock_user_lock); dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT); spin_unlock(&shmlock_user_lock); put_ucounts(ucounts); } |
668 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Dynamic loading of modules into the kernel. * * Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996 * Rewritten again by Rusty Russell, 2002 */ #ifndef _LINUX_MODULE_H #define _LINUX_MODULE_H #include <linux/list.h> #include <linux/stat.h> #include <linux/buildid.h> #include <linux/compiler.h> #include <linux/cache.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/elf.h> #include <linux/stringify.h> #include <linux/kobject.h> #include <linux/moduleparam.h> #include <linux/jump_label.h> #include <linux/export.h> #include <linux/rbtree_latch.h> #include <linux/error-injection.h> #include <linux/tracepoint-defs.h> #include <linux/srcu.h> #include <linux/static_call_types.h> #include <linux/dynamic_debug.h> #include <linux/percpu.h> #include <asm/module.h> #define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN struct modversion_info { unsigned long crc; char name[MODULE_NAME_LEN]; }; struct module; struct exception_table_entry; struct module_kobject { struct kobject kobj; struct module *mod; struct kobject *drivers_dir; struct module_param_attrs *mp; struct completion *kobj_completion; } __randomize_layout; struct module_attribute { struct attribute attr; ssize_t (*show)(struct module_attribute *, struct module_kobject *, char *); ssize_t (*store)(struct module_attribute *, struct module_kobject *, const char *, size_t count); void (*setup)(struct module *, const char *); int (*test)(struct module *); void (*free)(struct module *); }; struct module_version_attribute { struct module_attribute mattr; const char *module_name; const char *version; }; extern ssize_t __modver_version_show(struct module_attribute *, struct module_kobject *, char *); extern struct module_attribute module_uevent; /* These are either module local, or the kernel's dummy ones. */ extern int init_module(void); extern void cleanup_module(void); #ifndef MODULE /** * module_init() - driver initialization entry point * @x: function to be run at kernel boot time or module insertion * * module_init() will either be called during do_initcalls() (if * builtin) or at module insertion time (if a module). There can only * be one per module. */ #define module_init(x) __initcall(x); /** * module_exit() - driver exit entry point * @x: function to be run when driver is removed * * module_exit() will wrap the driver clean-up code * with cleanup_module() when used with rmmod when * the driver is a module. If the driver is statically * compiled into the kernel, module_exit() has no effect. * There can only be one per module. */ #define module_exit(x) __exitcall(x); #else /* MODULE */ /* * In most cases loadable modules do not need custom * initcall levels. There are still some valid cases where * a driver may be needed early if built in, and does not * matter when built as a loadable module. Like bus * snooping debug drivers. */ #define early_initcall(fn) module_init(fn) #define core_initcall(fn) module_init(fn) #define core_initcall_sync(fn) module_init(fn) #define postcore_initcall(fn) module_init(fn) #define postcore_initcall_sync(fn) module_init(fn) #define arch_initcall(fn) module_init(fn) #define subsys_initcall(fn) module_init(fn) #define subsys_initcall_sync(fn) module_init(fn) #define fs_initcall(fn) module_init(fn) #define fs_initcall_sync(fn) module_init(fn) #define rootfs_initcall(fn) module_init(fn) #define device_initcall(fn) module_init(fn) #define device_initcall_sync(fn) module_init(fn) #define late_initcall(fn) module_init(fn) #define late_initcall_sync(fn) module_init(fn) #define console_initcall(fn) module_init(fn) /* Each module must use one module_init(). */ #define module_init(initfn) \ static inline initcall_t __maybe_unused __inittest(void) \ { return initfn; } \ int init_module(void) __copy(initfn) \ __attribute__((alias(#initfn))); \ ___ADDRESSABLE(init_module, __initdata); /* This is only required if you want to be unloadable. */ #define module_exit(exitfn) \ static inline exitcall_t __maybe_unused __exittest(void) \ { return exitfn; } \ void cleanup_module(void) __copy(exitfn) \ __attribute__((alias(#exitfn))); \ ___ADDRESSABLE(cleanup_module, __exitdata); #endif /* This means "can be init if no module support, otherwise module load may call it." */ #ifdef CONFIG_MODULES #define __init_or_module #define __initdata_or_module #define __initconst_or_module #define __INIT_OR_MODULE .text #define __INITDATA_OR_MODULE .data #define __INITRODATA_OR_MODULE .section ".rodata","a",%progbits #else #define __init_or_module __init #define __initdata_or_module __initdata #define __initconst_or_module __initconst #define __INIT_OR_MODULE __INIT #define __INITDATA_OR_MODULE __INITDATA #define __INITRODATA_OR_MODULE __INITRODATA #endif /*CONFIG_MODULES*/ /* Generic info of form tag = "info" */ #define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info) /* For userspace: you can also call me... */ #define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias) /* Soft module dependencies. See man modprobe.d for details. * Example: MODULE_SOFTDEP("pre: module-foo module-bar post: module-baz") */ #define MODULE_SOFTDEP(_softdep) MODULE_INFO(softdep, _softdep) /* * MODULE_FILE is used for generating modules.builtin * So, make it no-op when this is being built as a module */ #ifdef MODULE #define MODULE_FILE #else #define MODULE_FILE MODULE_INFO(file, KBUILD_MODFILE); #endif /* * The following license idents are currently accepted as indicating free * software modules * * "GPL" [GNU Public License v2] * "GPL v2" [GNU Public License v2] * "GPL and additional rights" [GNU Public License v2 rights and more] * "Dual BSD/GPL" [GNU Public License v2 * or BSD license choice] * "Dual MIT/GPL" [GNU Public License v2 * or MIT license choice] * "Dual MPL/GPL" [GNU Public License v2 * or Mozilla license choice] * * The following other idents are available * * "Proprietary" [Non free products] * * Both "GPL v2" and "GPL" (the latter also in dual licensed strings) are * merely stating that the module is licensed under the GPL v2, but are not * telling whether "GPL v2 only" or "GPL v2 or later". The reason why there * are two variants is a historic and failed attempt to convey more * information in the MODULE_LICENSE string. For module loading the * "only/or later" distinction is completely irrelevant and does neither * replace the proper license identifiers in the corresponding source file * nor amends them in any way. The sole purpose is to make the * 'Proprietary' flagging work and to refuse to bind symbols which are * exported with EXPORT_SYMBOL_GPL when a non free module is loaded. * * In the same way "BSD" is not a clear license information. It merely * states, that the module is licensed under one of the compatible BSD * license variants. The detailed and correct license information is again * to be found in the corresponding source files. * * There are dual licensed components, but when running with Linux it is the * GPL that is relevant so this is a non issue. Similarly LGPL linked with GPL * is a GPL combined work. * * This exists for several reasons * 1. So modinfo can show license info for users wanting to vet their setup * is free * 2. So the community can ignore bug reports including proprietary modules * 3. So vendors can do likewise based on their own policies */ #define MODULE_LICENSE(_license) MODULE_FILE MODULE_INFO(license, _license) /* * Author(s), use "Name <email>" or just "Name", for multiple * authors use multiple MODULE_AUTHOR() statements/lines. */ #define MODULE_AUTHOR(_author) MODULE_INFO(author, _author) /* What your module does. */ #define MODULE_DESCRIPTION(_description) MODULE_INFO(description, _description) #ifdef MODULE /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ extern typeof(name) __mod_##type##__##name##_device_table \ __attribute__ ((unused, alias(__stringify(name)))) #else /* !MODULE */ #define MODULE_DEVICE_TABLE(type, name) #endif /* Version of form [<epoch>:]<version>[-<extra-version>]. * Or for CVS/RCS ID version, everything but the number is stripped. * <epoch>: A (small) unsigned integer which allows you to start versions * anew. If not mentioned, it's zero. eg. "2:1.0" is after * "1:2.0". * <version>: The <version> may contain only alphanumerics and the * character `.'. Ordered by numeric sort for numeric parts, * ascii sort for ascii parts (as per RPM or DEB algorithm). * <extraversion>: Like <version>, but inserted for local * customizations, eg "rh3" or "rusty1". * Using this automatically adds a checksum of the .c files and the * local headers in "srcversion". */ #if defined(MODULE) || !defined(CONFIG_SYSFS) #define MODULE_VERSION(_version) MODULE_INFO(version, _version) #else #define MODULE_VERSION(_version) \ MODULE_INFO(version, _version); \ static struct module_version_attribute __modver_attr \ __used __section("__modver") \ __aligned(__alignof__(struct module_version_attribute)) \ = { \ .mattr = { \ .attr = { \ .name = "version", \ .mode = S_IRUGO, \ }, \ .show = __modver_version_show, \ }, \ .module_name = KBUILD_MODNAME, \ .version = _version, \ } #endif /* Optional firmware file (or files) needed by the module * format is simply firmware file name. Multiple firmware * files require multiple MODULE_FIRMWARE() specifiers */ #define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware) #define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, __stringify(ns)) struct notifier_block; #ifdef CONFIG_MODULES extern int modules_disabled; /* for sysctl */ /* Get/put a kernel symbol (calls must be symmetric) */ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); #define symbol_get(x) ((typeof(&x))(__symbol_get(__stringify(x)))) /* modules using other modules: kdb wants to see this. */ struct module_use { struct list_head source_list; struct list_head target_list; struct module *source, *target; }; enum module_state { MODULE_STATE_LIVE, /* Normal state. */ MODULE_STATE_COMING, /* Full formed, running module_init. */ MODULE_STATE_GOING, /* Going away. */ MODULE_STATE_UNFORMED, /* Still setting it up. */ }; struct mod_tree_node { struct module *mod; struct latch_tree_node node; }; enum mod_mem_type { MOD_TEXT = 0, MOD_DATA, MOD_RODATA, MOD_RO_AFTER_INIT, MOD_INIT_TEXT, MOD_INIT_DATA, MOD_INIT_RODATA, MOD_MEM_NUM_TYPES, MOD_INVALID = -1, }; #define mod_mem_type_is_init(type) \ ((type) == MOD_INIT_TEXT || \ (type) == MOD_INIT_DATA || \ (type) == MOD_INIT_RODATA) #define mod_mem_type_is_core(type) (!mod_mem_type_is_init(type)) #define mod_mem_type_is_text(type) \ ((type) == MOD_TEXT || \ (type) == MOD_INIT_TEXT) #define mod_mem_type_is_data(type) (!mod_mem_type_is_text(type)) #define mod_mem_type_is_core_data(type) \ (mod_mem_type_is_core(type) && \ mod_mem_type_is_data(type)) #define for_each_mod_mem_type(type) \ for (enum mod_mem_type (type) = 0; \ (type) < MOD_MEM_NUM_TYPES; (type)++) #define for_class_mod_mem_type(type, class) \ for_each_mod_mem_type(type) \ if (mod_mem_type_is_##class(type)) struct module_memory { void *base; unsigned int size; #ifdef CONFIG_MODULES_TREE_LOOKUP struct mod_tree_node mtn; #endif }; #ifdef CONFIG_MODULES_TREE_LOOKUP /* Only touch one cacheline for common rbtree-for-core-layout case. */ #define __module_memory_align ____cacheline_aligned #else #define __module_memory_align #endif struct mod_kallsyms { Elf_Sym *symtab; unsigned int num_symtab; char *strtab; char *typetab; }; #ifdef CONFIG_LIVEPATCH /** * struct klp_modinfo - ELF information preserved from the livepatch module * * @hdr: ELF header * @sechdrs: Section header table * @secstrings: String table for the section headers * @symndx: The symbol table section index */ struct klp_modinfo { Elf_Ehdr hdr; Elf_Shdr *sechdrs; char *secstrings; unsigned int symndx; }; #endif struct module { enum module_state state; /* Member of list of modules */ struct list_head list; /* Unique handle for this module */ char name[MODULE_NAME_LEN]; #ifdef CONFIG_STACKTRACE_BUILD_ID /* Module build ID */ unsigned char build_id[BUILD_ID_SIZE_MAX]; #endif /* Sysfs stuff. */ struct module_kobject mkobj; struct module_attribute *modinfo_attrs; const char *version; const char *srcversion; struct kobject *holders_dir; /* Exported symbols */ const struct kernel_symbol *syms; const s32 *crcs; unsigned int num_syms; #ifdef CONFIG_ARCH_USES_CFI_TRAPS s32 *kcfi_traps; s32 *kcfi_traps_end; #endif /* Kernel parameters. */ #ifdef CONFIG_SYSFS struct mutex param_lock; #endif struct kernel_param *kp; unsigned int num_kp; /* GPL-only exported symbols. */ unsigned int num_gpl_syms; const struct kernel_symbol *gpl_syms; const s32 *gpl_crcs; bool using_gplonly_symbols; #ifdef CONFIG_MODULE_SIG /* Signature was verified. */ bool sig_ok; #endif bool async_probe_requested; /* Exception table */ unsigned int num_exentries; struct exception_table_entry *extable; /* Startup function. */ int (*init)(void); struct module_memory mem[MOD_MEM_NUM_TYPES] __module_memory_align; /* Arch-specific module values */ struct mod_arch_specific arch; unsigned long taints; /* same bits as kernel:taint_flags */ #ifdef CONFIG_GENERIC_BUG /* Support for BUG */ unsigned num_bugs; struct list_head bug_list; struct bug_entry *bug_table; #endif #ifdef CONFIG_KALLSYMS /* Protected by RCU and/or module_mutex: use rcu_dereference() */ struct mod_kallsyms __rcu *kallsyms; struct mod_kallsyms core_kallsyms; /* Section attributes */ struct module_sect_attrs *sect_attrs; /* Notes attributes */ struct module_notes_attrs *notes_attrs; #endif /* The command line arguments (may be mangled). People like keeping pointers to this stuff */ char *args; #ifdef CONFIG_SMP /* Per-cpu data. */ void __percpu *percpu; unsigned int percpu_size; #endif void *noinstr_text_start; unsigned int noinstr_text_size; #ifdef CONFIG_TRACEPOINTS unsigned int num_tracepoints; tracepoint_ptr_t *tracepoints_ptrs; #endif #ifdef CONFIG_TREE_SRCU unsigned int num_srcu_structs; struct srcu_struct **srcu_struct_ptrs; #endif #ifdef CONFIG_BPF_EVENTS unsigned int num_bpf_raw_events; struct bpf_raw_event_map *bpf_raw_events; #endif #ifdef CONFIG_DEBUG_INFO_BTF_MODULES unsigned int btf_data_size; void *btf_data; #endif #ifdef CONFIG_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; #endif #ifdef CONFIG_TRACING unsigned int num_trace_bprintk_fmt; const char **trace_bprintk_fmt_start; #endif #ifdef CONFIG_EVENT_TRACING struct trace_event_call **trace_events; unsigned int num_trace_events; struct trace_eval_map **trace_evals; unsigned int num_trace_evals; #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD unsigned int num_ftrace_callsites; unsigned long *ftrace_callsites; #endif #ifdef CONFIG_KPROBES void *kprobes_text_start; unsigned int kprobes_text_size; unsigned long *kprobe_blacklist; unsigned int num_kprobe_blacklist; #endif #ifdef CONFIG_HAVE_STATIC_CALL_INLINE int num_static_call_sites; struct static_call_site *static_call_sites; #endif #if IS_ENABLED(CONFIG_KUNIT) int num_kunit_init_suites; struct kunit_suite **kunit_init_suites; int num_kunit_suites; struct kunit_suite **kunit_suites; #endif #ifdef CONFIG_LIVEPATCH bool klp; /* Is this a livepatch module? */ bool klp_alive; /* ELF information */ struct klp_modinfo *klp_info; #endif #ifdef CONFIG_PRINTK_INDEX unsigned int printk_index_size; struct pi_entry **printk_index_start; #endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ struct list_head source_list; /* What modules do I depend on? */ struct list_head target_list; /* Destruction function. */ void (*exit)(void); atomic_t refcnt; #endif #ifdef CONFIG_CONSTRUCTORS /* Constructor functions. */ ctor_fn_t *ctors; unsigned int num_ctors; #endif #ifdef CONFIG_FUNCTION_ERROR_INJECTION struct error_injection_entry *ei_funcs; unsigned int num_ei_funcs; #endif #ifdef CONFIG_DYNAMIC_DEBUG_CORE struct _ddebug_info dyndbg_info; #endif } ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} #endif #ifndef HAVE_ARCH_KALLSYMS_SYMBOL_VALUE static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym) { return sym->st_value; } #endif /* FIXME: It'd be nice to isolate modules during init, too, so they aren't used before they (may) fail. But presently too much code (IDE & SCSI) require entry into the module during init.*/ static inline bool module_is_live(struct module *mod) { return mod->state != MODULE_STATE_GOING; } struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr); bool is_module_percpu_address(unsigned long addr); bool is_module_text_address(unsigned long addr); static inline bool within_module_mem_type(unsigned long addr, const struct module *mod, enum mod_mem_type type) { unsigned long base, size; base = (unsigned long)mod->mem[type].base; size = mod->mem[type].size; return addr - base < size; } static inline bool within_module_core(unsigned long addr, const struct module *mod) { for_class_mod_mem_type(type, core) { if (within_module_mem_type(addr, mod, type)) return true; } return false; } static inline bool within_module_init(unsigned long addr, const struct module *mod) { for_class_mod_mem_type(type, init) { if (within_module_mem_type(addr, mod, type)) return true; } return false; } static inline bool within_module(unsigned long addr, const struct module *mod) { return within_module_init(addr, mod) || within_module_core(addr, mod); } /* Search for module by name: must be in a RCU-sched critical section. */ struct module *find_module(const char *name); extern void __noreturn __module_put_and_kthread_exit(struct module *mod, long code); #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code) #ifdef CONFIG_MODULE_UNLOAD int module_refcount(struct module *mod); void __symbol_put(const char *symbol); #define symbol_put(x) __symbol_put(__stringify(x)) void symbol_put_addr(void *addr); /* Sometimes we know we already have a refcount, and it's easier not to handle the error case (which only happens with rmmod --wait). */ extern void __module_get(struct module *module); /** * try_module_get() - take module refcount unless module is being removed * @module: the module we should check for * * Only try to get a module reference count if the module is not being removed. * This call will fail if the module is in the process of being removed. * * Care must also be taken to ensure the module exists and is alive prior to * usage of this call. This can be gauranteed through two means: * * 1) Direct protection: you know an earlier caller must have increased the * module reference through __module_get(). This can typically be achieved * by having another entity other than the module itself increment the * module reference count. * * 2) Implied protection: there is an implied protection against module * removal. An example of this is the implied protection used by kernfs / * sysfs. The sysfs store / read file operations are guaranteed to exist * through the use of kernfs's active reference (see kernfs_active()) and a * sysfs / kernfs file removal cannot happen unless the same file is not * active. Therefore, if a sysfs file is being read or written to the module * which created it must still exist. It is therefore safe to use * try_module_get() on module sysfs store / read ops. * * One of the real values to try_module_get() is the module_is_live() check * which ensures that the caller of try_module_get() can yield to userspace * module removal requests and gracefully fail if the module is on its way out. * * Returns true if the reference count was successfully incremented. */ extern bool try_module_get(struct module *module); /** * module_put() - release a reference count to a module * @module: the module we should release a reference count for * * If you successfully bump a reference count to a module with try_module_get(), * when you are finished you must call module_put() to release that reference * count. */ extern void module_put(struct module *module); #else /*!CONFIG_MODULE_UNLOAD*/ static inline bool try_module_get(struct module *module) { return !module || module_is_live(module); } static inline void module_put(struct module *module) { } static inline void __module_get(struct module *module) { } #define symbol_put(x) do { } while (0) #define symbol_put_addr(p) do { } while (0) #endif /* CONFIG_MODULE_UNLOAD */ /* This is a #define so the string doesn't get put in every .o file */ #define module_name(mod) \ ({ \ struct module *__mod = (mod); \ __mod ? __mod->name : "kernel"; \ }) /* Dereference module function descriptor */ void *dereference_module_function_descriptor(struct module *mod, void *ptr); int register_module_notifier(struct notifier_block *nb); int unregister_module_notifier(struct notifier_block *nb); extern void print_modules(void); static inline bool module_requested_async_probing(struct module *module) { return module && module->async_probe_requested; } static inline bool is_livepatch_module(struct module *mod) { #ifdef CONFIG_LIVEPATCH return mod->klp; #else return false; #endif } void set_module_sig_enforced(void); #else /* !CONFIG_MODULES... */ static inline struct module *__module_address(unsigned long addr) { return NULL; } static inline struct module *__module_text_address(unsigned long addr) { return NULL; } static inline bool is_module_address(unsigned long addr) { return false; } static inline bool is_module_percpu_address(unsigned long addr) { return false; } static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) { return false; } static inline bool is_module_text_address(unsigned long addr) { return false; } static inline bool within_module_core(unsigned long addr, const struct module *mod) { return false; } static inline bool within_module_init(unsigned long addr, const struct module *mod) { return false; } static inline bool within_module(unsigned long addr, const struct module *mod) { return false; } /* Get/put a kernel symbol (calls should be symmetric) */ #define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); }) #define symbol_put(x) do { } while (0) #define symbol_put_addr(x) do { } while (0) static inline void __module_get(struct module *module) { } static inline bool try_module_get(struct module *module) { return true; } static inline void module_put(struct module *module) { } #define module_name(mod) "kernel" static inline int register_module_notifier(struct notifier_block *nb) { /* no events will happen anyway, so this can always succeed */ return 0; } static inline int unregister_module_notifier(struct notifier_block *nb) { return 0; } #define module_put_and_kthread_exit(code) kthread_exit(code) static inline void print_modules(void) { } static inline bool module_requested_async_probing(struct module *module) { return false; } static inline void set_module_sig_enforced(void) { } /* Dereference module function descriptor */ static inline void *dereference_module_function_descriptor(struct module *mod, void *ptr) { return ptr; } #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS extern struct kset *module_kset; extern const struct kobj_type module_ktype; #endif /* CONFIG_SYSFS */ #define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x) /* BELOW HERE ALL THESE ARE OBSOLETE AND WILL VANISH */ #define __MODULE_STRING(x) __stringify(x) #ifdef CONFIG_GENERIC_BUG void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, struct module *); void module_bug_cleanup(struct module *); #else /* !CONFIG_GENERIC_BUG */ static inline void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod) { } static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ #ifdef CONFIG_MITIGATION_RETPOLINE extern bool retpoline_module_ok(bool has_retpoline); #else static inline bool retpoline_module_ok(bool has_retpoline) { return true; } #endif #ifdef CONFIG_MODULE_SIG bool is_module_sig_enforced(void); static inline bool module_sig_ok(struct module *module) { return module->sig_ok; } #else /* !CONFIG_MODULE_SIG */ static inline bool is_module_sig_enforced(void) { return false; } static inline bool module_sig_ok(struct module *module) { return true; } #endif /* CONFIG_MODULE_SIG */ #if defined(CONFIG_MODULES) && defined(CONFIG_KALLSYMS) int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, unsigned long), void *data); /* For kallsyms to ask for address resolution. namebuf should be at * least KSYM_NAME_LEN long: a pointer to namebuf is returned if * found, otherwise NULL. */ const char *module_address_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf); int lookup_module_symbol_name(unsigned long addr, char *symname); int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); /* Returns 0 and fills in value, defined and namebuf, or -ERANGE if * symnum out of range. */ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported); /* Look for this name: can be of form module:name. */ unsigned long module_kallsyms_lookup_name(const char *name); unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); #else /* CONFIG_MODULES && CONFIG_KALLSYMS */ static inline int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, unsigned long), void *data) { return -EOPNOTSUPP; } /* For kallsyms to ask for address resolution. NULL means not found. */ static inline const char *module_address_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf) { return NULL; } static inline int lookup_module_symbol_name(unsigned long addr, char *symname) { return -ERANGE; } static inline int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported) { return -ERANGE; } static inline unsigned long module_kallsyms_lookup_name(const char *name) { return 0; } static inline unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) { return 0; } #endif /* CONFIG_MODULES && CONFIG_KALLSYMS */ #endif /* _LINUX_MODULE_H */ |
12 7 12 3 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 | /* SPDX-License-Identifier: GPL-2.0-only */ #undef TRACE_SYSTEM #define TRACE_SYSTEM l2tp #if !defined(_TRACE_L2TP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_L2TP_H #include <linux/tracepoint.h> #include <linux/l2tp.h> #include "l2tp_core.h" #define encap_type_name(e) { L2TP_ENCAPTYPE_##e, #e } #define show_encap_type_name(val) \ __print_symbolic(val, \ encap_type_name(UDP), \ encap_type_name(IP)) #define pw_type_name(p) { L2TP_PWTYPE_##p, #p } #define show_pw_type_name(val) \ __print_symbolic(val, \ pw_type_name(ETH_VLAN), \ pw_type_name(ETH), \ pw_type_name(PPP), \ pw_type_name(PPP_AC), \ pw_type_name(IP)) DECLARE_EVENT_CLASS(tunnel_only_evt, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel), TP_STRUCT__entry( __array(char, name, L2TP_TUNNEL_NAME_MAX) ), TP_fast_assign( memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX); ), TP_printk("%s", __entry->name) ); DECLARE_EVENT_CLASS(session_only_evt, TP_PROTO(struct l2tp_session *session), TP_ARGS(session), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); ), TP_printk("%s", __entry->name) ); TRACE_EVENT(register_tunnel, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel), TP_STRUCT__entry( __array(char, name, L2TP_TUNNEL_NAME_MAX) __field(int, fd) __field(u32, tid) __field(u32, ptid) __field(int, version) __field(enum l2tp_encap_type, encap) ), TP_fast_assign( memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX); __entry->fd = tunnel->fd; __entry->tid = tunnel->tunnel_id; __entry->ptid = tunnel->peer_tunnel_id; __entry->version = tunnel->version; __entry->encap = tunnel->encap; ), TP_printk("%s: type=%s encap=%s version=L2TPv%d tid=%u ptid=%u fd=%d", __entry->name, __entry->fd > 0 ? "managed" : "unmanaged", show_encap_type_name(__entry->encap), __entry->version, __entry->tid, __entry->ptid, __entry->fd) ); DEFINE_EVENT(tunnel_only_evt, delete_tunnel, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel) ); DEFINE_EVENT(tunnel_only_evt, free_tunnel, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel) ); TRACE_EVENT(register_session, TP_PROTO(struct l2tp_session *session), TP_ARGS(session), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) __field(u32, tid) __field(u32, ptid) __field(u32, sid) __field(u32, psid) __field(enum l2tp_pwtype, pwtype) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); __entry->tid = session->tunnel ? session->tunnel->tunnel_id : 0; __entry->ptid = session->tunnel ? session->tunnel->peer_tunnel_id : 0; __entry->sid = session->session_id; __entry->psid = session->peer_session_id; __entry->pwtype = session->pwtype; ), TP_printk("%s: pseudowire=%s sid=%u psid=%u tid=%u ptid=%u", __entry->name, show_pw_type_name(__entry->pwtype), __entry->sid, __entry->psid, __entry->sid, __entry->psid) ); DEFINE_EVENT(session_only_evt, delete_session, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_only_evt, free_session, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_only_evt, session_seqnum_lns_enable, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_only_evt, session_seqnum_lns_disable, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DECLARE_EVENT_CLASS(session_seqnum_evt, TP_PROTO(struct l2tp_session *session), TP_ARGS(session), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) __field(u32, ns) __field(u32, nr) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); __entry->ns = session->ns; __entry->nr = session->nr; ), TP_printk("%s: ns=%u nr=%u", __entry->name, __entry->ns, __entry->nr) ); DEFINE_EVENT(session_seqnum_evt, session_seqnum_update, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_seqnum_evt, session_seqnum_reset, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DECLARE_EVENT_CLASS(session_pkt_discard_evt, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) __field(u32, pkt_ns) __field(u32, my_nr) __field(u32, reorder_q_len) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); __entry->pkt_ns = pkt_ns, __entry->my_nr = session->nr; __entry->reorder_q_len = skb_queue_len(&session->reorder_q); ), TP_printk("%s: pkt_ns=%u my_nr=%u reorder_q_len=%u", __entry->name, __entry->pkt_ns, __entry->my_nr, __entry->reorder_q_len) ); DEFINE_EVENT(session_pkt_discard_evt, session_pkt_expired, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns) ); DEFINE_EVENT(session_pkt_discard_evt, session_pkt_outside_rx_window, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns) ); DEFINE_EVENT(session_pkt_discard_evt, session_pkt_oos, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns) ); #endif /* _TRACE_L2TP_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h> |
5 2 3 8 8 1 5 5 1019 420 76 19 19 214 233 232 235 209 59 7 1970 18 1951 136 3 133 7 1 476 52 5 40 334 189 194 191 322 22 2 121 71 2 47 2 48 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * * This file contains the interface functions for the various time related * system calls: time, stime, gettimeofday, settimeofday, adjtime * * Modification history: * * 1993-09-02 Philip Gladstone * Created file with time related functions from sched/core.c and adjtimex() * 1993-10-08 Torsten Duwe * adjtime interface update and CMOS clock write code * 1995-08-13 Torsten Duwe * kernel PLL updated to 1994-12-13 specs (rfc-1589) * 1999-01-16 Ulrich Windl * Introduced error checking for many cases in adjtimex(). * Updated NTP code according to technical memorandum Jan '96 * "A Kernel Model for Precision Timekeeping" by Dave Mills * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) * (Even though the technical memorandum forbids it) * 2004-07-14 Christoph Lameter * Added getnstimeofday to allow the posix timer functions to return * with nanosecond accuracy */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/timex.h> #include <linux/capability.h> #include <linux/timekeeper_internal.h> #include <linux/errno.h> #include <linux/syscalls.h> #include <linux/security.h> #include <linux/fs.h> #include <linux/math64.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/compat.h> #include <asm/unistd.h> #include <generated/timeconst.h> #include "timekeeping.h" /* * The timezone where the local system is located. Used as a default by some * programs who obtain this value by using gettimeofday. */ struct timezone sys_tz; EXPORT_SYMBOL(sys_tz); #ifdef __ARCH_WANT_SYS_TIME /* * sys_time() can be implemented in user-level using * sys_gettimeofday(). Is this for backwards compatibility? If so, * why not move it into the appropriate arch directory (for those * architectures that need it). */ SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc) { __kernel_old_time_t i = (__kernel_old_time_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) return -EFAULT; } force_successful_syscall_return(); return i; } /* * sys_stime() can be implemented in user-level using * sys_settimeofday(). Is this for backwards compatibility? If so, * why not move it into the appropriate arch directory (for those * architectures that need it). */ SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr) { struct timespec64 tv; int err; if (get_user(tv.tv_sec, tptr)) return -EFAULT; tv.tv_nsec = 0; err = security_settime64(&tv, NULL); if (err) return err; do_settimeofday64(&tv); return 0; } #endif /* __ARCH_WANT_SYS_TIME */ #ifdef CONFIG_COMPAT_32BIT_TIME #ifdef __ARCH_WANT_SYS_TIME32 /* old_time32_t is a 32 bit "long" and needs to get converted. */ SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc) { old_time32_t i; i = (old_time32_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) return -EFAULT; } force_successful_syscall_return(); return i; } SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr) { struct timespec64 tv; int err; if (get_user(tv.tv_sec, tptr)) return -EFAULT; tv.tv_nsec = 0; err = security_settime64(&tv, NULL); if (err) return err; do_settimeofday64(&tv); return 0; } #endif /* __ARCH_WANT_SYS_TIME32 */ #endif SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { struct timespec64 ts; ktime_get_real_ts64(&ts); if (put_user(ts.tv_sec, &tv->tv_sec) || put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (unlikely(tz != NULL)) { if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) return -EFAULT; } return 0; } /* * In case for some reason the CMOS clock has not already been running * in UTC, but in some local time: The first time we set the timezone, * we will warp the clock so that it is ticking UTC time instead of * local time. Presumably, if someone is setting the timezone then we * are running in an environment where the programs understand about * timezones. This should be done at boot time in the /etc/rc script, * as soon as possible, so that the clock can be set right. Otherwise, * various programs will get confused when the clock gets warped. */ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz) { static int firsttime = 1; int error = 0; if (tv && !timespec64_valid_settod(tv)) return -EINVAL; error = security_settime64(tv, tz); if (error) return error; if (tz) { /* Verify we're within the +-15 hrs range */ if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) return -EINVAL; sys_tz = *tz; update_vsyscall_tz(); if (firsttime) { firsttime = 0; if (!tv) timekeeping_warp_clock(); } } if (tv) return do_settimeofday64(tv); return 0; } SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; struct timezone new_tz; if (tv) { if (get_user(new_ts.tv_sec, &tv->tv_sec) || get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) return -EFAULT; } return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { if (tv) { struct timespec64 ts; ktime_get_real_ts64(&ts); if (put_user(ts.tv_sec, &tv->tv_sec) || put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (tz) { if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) return -EFAULT; } return 0; } COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; struct timezone new_tz; if (tv) { if (get_user(new_ts.tv_sec, &tv->tv_sec) || get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) return -EFAULT; } return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } #endif #ifdef CONFIG_64BIT SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p) { struct __kernel_timex txc; /* Local copy of parameter */ int ret; /* Copy the user data space into the kernel copy * structure. But bear in mind that the structures * may change */ if (copy_from_user(&txc, txc_p, sizeof(struct __kernel_timex))) return -EFAULT; ret = do_adjtimex(&txc); return copy_to_user(txc_p, &txc, sizeof(struct __kernel_timex)) ? -EFAULT : ret; } #endif #ifdef CONFIG_COMPAT_32BIT_TIME int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp) { struct old_timex32 tx32; memset(txc, 0, sizeof(struct __kernel_timex)); if (copy_from_user(&tx32, utp, sizeof(struct old_timex32))) return -EFAULT; txc->modes = tx32.modes; txc->offset = tx32.offset; txc->freq = tx32.freq; txc->maxerror = tx32.maxerror; txc->esterror = tx32.esterror; txc->status = tx32.status; txc->constant = tx32.constant; txc->precision = tx32.precision; txc->tolerance = tx32.tolerance; txc->time.tv_sec = tx32.time.tv_sec; txc->time.tv_usec = tx32.time.tv_usec; txc->tick = tx32.tick; txc->ppsfreq = tx32.ppsfreq; txc->jitter = tx32.jitter; txc->shift = tx32.shift; txc->stabil = tx32.stabil; txc->jitcnt = tx32.jitcnt; txc->calcnt = tx32.calcnt; txc->errcnt = tx32.errcnt; txc->stbcnt = tx32.stbcnt; return 0; } int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc) { struct old_timex32 tx32; memset(&tx32, 0, sizeof(struct old_timex32)); tx32.modes = txc->modes; tx32.offset = txc->offset; tx32.freq = txc->freq; tx32.maxerror = txc->maxerror; tx32.esterror = txc->esterror; tx32.status = txc->status; tx32.constant = txc->constant; tx32.precision = txc->precision; tx32.tolerance = txc->tolerance; tx32.time.tv_sec = txc->time.tv_sec; tx32.time.tv_usec = txc->time.tv_usec; tx32.tick = txc->tick; tx32.ppsfreq = txc->ppsfreq; tx32.jitter = txc->jitter; tx32.shift = txc->shift; tx32.stabil = txc->stabil; tx32.jitcnt = txc->jitcnt; tx32.calcnt = txc->calcnt; tx32.errcnt = txc->errcnt; tx32.stbcnt = txc->stbcnt; tx32.tai = txc->tai; if (copy_to_user(utp, &tx32, sizeof(struct old_timex32))) return -EFAULT; return 0; } SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp) { struct __kernel_timex txc; int err, ret; err = get_old_timex32(&txc, utp); if (err) return err; ret = do_adjtimex(&txc); err = put_old_timex32(utp, &txc); if (err) return err; return ret; } #endif /** * jiffies_to_msecs - Convert jiffies to milliseconds * @j: jiffies value * * Avoid unnecessary multiplications/divisions in the * two most common HZ cases. * * Return: milliseconds value */ unsigned int jiffies_to_msecs(const unsigned long j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) return (MSEC_PER_SEC / HZ) * j; #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); #else # if BITS_PER_LONG == 32 return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >> HZ_TO_MSEC_SHR32; # else return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); # endif #endif } EXPORT_SYMBOL(jiffies_to_msecs); /** * jiffies_to_usecs - Convert jiffies to microseconds * @j: jiffies value * * Return: microseconds value */ unsigned int jiffies_to_usecs(const unsigned long j) { /* * Hz usually doesn't go much further MSEC_PER_SEC. * jiffies_to_usecs() and usecs_to_jiffies() depend on that. */ BUILD_BUG_ON(HZ > USEC_PER_SEC); #if !(USEC_PER_SEC % HZ) return (USEC_PER_SEC / HZ) * j; #else # if BITS_PER_LONG == 32 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; # else return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; # endif #endif } EXPORT_SYMBOL(jiffies_to_usecs); /** * mktime64 - Converts date to seconds. * @year0: year to convert * @mon0: month to convert * @day: day to convert * @hour: hour to convert * @min: minute to convert * @sec: second to convert * * Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. * * [For the Julian calendar (which was used in Russia before 1917, * Britain & colonies before 1752, anywhere else before 1582, * and is still in use by some communities) leave out the * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). * * A leap second can be indicated by calling this function with sec as * 60 (allowable under ISO 8601). The leap second is treated the same * as the following second since they don't exist in UNIX time. * * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight * tomorrow - (allowable under ISO 8601) is supported. * * Return: seconds since the epoch time for the given input date */ time64_t mktime64(const unsigned int year0, const unsigned int mon0, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec) { unsigned int mon = mon0, year = year0; /* 1..12 -> 11,12,1..10 */ if (0 >= (int) (mon -= 2)) { mon += 12; /* Puts Feb last since it has leap day */ year -= 1; } return ((((time64_t) (year/4 - year/100 + year/400 + 367*mon/12 + day) + year*365 - 719499 )*24 + hour /* now have hours - midnight tomorrow handled here */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } EXPORT_SYMBOL(mktime64); struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); struct __kernel_old_timeval tv; tv.tv_sec = ts.tv_sec; tv.tv_usec = (suseconds_t)ts.tv_nsec / 1000; return tv; } EXPORT_SYMBOL(ns_to_kernel_old_timeval); /** * set_normalized_timespec64 - set timespec sec and nsec parts and normalize * * @ts: pointer to timespec variable to be set * @sec: seconds to set * @nsec: nanoseconds to set * * Set seconds and nanoseconds field of a timespec variable and * normalize to the timespec storage format * * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC. * For negative values only the tv_sec field is negative ! */ void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) { while (nsec >= NSEC_PER_SEC) { /* * The following asm() prevents the compiler from * optimising this loop into a modulo operation. See * also __iter_div_u64_rem() in include/linux/time.h */ asm("" : "+rm"(nsec)); nsec -= NSEC_PER_SEC; ++sec; } while (nsec < 0) { asm("" : "+rm"(nsec)); nsec += NSEC_PER_SEC; --sec; } ts->tv_sec = sec; ts->tv_nsec = nsec; } EXPORT_SYMBOL(set_normalized_timespec64); /** * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * * Return: the timespec64 representation of the nsec parameter. */ struct timespec64 ns_to_timespec64(s64 nsec) { struct timespec64 ts = { 0, 0 }; s32 rem; if (likely(nsec > 0)) { ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); ts.tv_nsec = rem; } else if (nsec < 0) { /* * With negative times, tv_sec points to the earlier * second, and tv_nsec counts the nanoseconds since * then, so tv_nsec is always a positive number. */ ts.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1; ts.tv_nsec = NSEC_PER_SEC - rem - 1; } return ts; } EXPORT_SYMBOL(ns_to_timespec64); /** * __msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds * * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. * for the details see __msecs_to_jiffies() * * __msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code, __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * The _msecs_to_jiffies helpers are the HZ dependent conversion * routines found in include/linux/jiffies.h * * Return: jiffies value */ unsigned long __msecs_to_jiffies(const unsigned int m) { /* * Negative value, means infinite timeout: */ if ((int)m < 0) return MAX_JIFFY_OFFSET; return _msecs_to_jiffies(m); } EXPORT_SYMBOL(__msecs_to_jiffies); /** * __usecs_to_jiffies: - convert microseconds to jiffies * @u: time in milliseconds * * Return: jiffies value */ unsigned long __usecs_to_jiffies(const unsigned int u) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return _usecs_to_jiffies(u); } EXPORT_SYMBOL(__usecs_to_jiffies); /** * timespec64_to_jiffies - convert a timespec64 value to jiffies * @value: pointer to &struct timespec64 * * The TICK_NSEC - 1 rounds up the value to the next resolution. Note * that a remainder subtract here would not do the right thing as the * resolution values don't fall on second boundaries. I.e. the line: * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. * Note that due to the small error in the multiplier here, this * rounding is incorrect for sufficiently large values of tv_nsec, but * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're * OK. * * Rather, we just shift the bits off the right. * * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value. * * Return: jiffies value */ unsigned long timespec64_to_jiffies(const struct timespec64 *value) { u64 sec = value->tv_sec; long nsec = value->tv_nsec + TICK_NSEC - 1; if (sec >= MAX_SEC_IN_JIFFIES){ sec = MAX_SEC_IN_JIFFIES; nsec = 0; } return ((sec * SEC_CONVERSION) + (((u64)nsec * NSEC_CONVERSION) >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } EXPORT_SYMBOL(timespec64_to_jiffies); /** * jiffies_to_timespec64 - convert jiffies value to &struct timespec64 * @jiffies: jiffies value * @value: pointer to &struct timespec64 */ void jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) { /* * Convert jiffies to nanoseconds and separate with * one divide. */ u32 rem; value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, NSEC_PER_SEC, &rem); value->tv_nsec = rem; } EXPORT_SYMBOL(jiffies_to_timespec64); /* * Convert jiffies/jiffies_64 to clock_t and back. */ /** * jiffies_to_clock_t - Convert jiffies to clock_t * @x: jiffies value * * Return: jiffies converted to clock_t (CLOCKS_PER_SEC) */ clock_t jiffies_to_clock_t(unsigned long x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ return x * (USER_HZ / HZ); # else return x / (HZ / USER_HZ); # endif #else return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); #endif } EXPORT_SYMBOL(jiffies_to_clock_t); /** * clock_t_to_jiffies - Convert clock_t to jiffies * @x: clock_t value * * Return: clock_t value converted to jiffies */ unsigned long clock_t_to_jiffies(unsigned long x) { #if (HZ % USER_HZ)==0 if (x >= ~0UL / (HZ / USER_HZ)) return ~0UL; return x * (HZ / USER_HZ); #else /* Don't worry about loss of precision here .. */ if (x >= ~0UL / HZ * USER_HZ) return ~0UL; /* .. but do try to contain it here */ return div_u64((u64)x * HZ, USER_HZ); #endif } EXPORT_SYMBOL(clock_t_to_jiffies); /** * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t * @x: jiffies_64 value * * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) */ u64 jiffies_64_to_clock_t(u64 x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ x = div_u64(x * USER_HZ, HZ); # elif HZ > USER_HZ x = div_u64(x, HZ / USER_HZ); # else /* Nothing to do */ # endif #else /* * There are better ways that don't overflow early, * but even this doesn't overflow in hundreds of years * in 64 bits, so.. */ x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); #endif return x; } EXPORT_SYMBOL(jiffies_64_to_clock_t); /** * nsec_to_clock_t - Convert nsec value to clock_t * @x: nsec value * * Return: nsec value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) */ u64 nsec_to_clock_t(u64 x) { #if (NSEC_PER_SEC % USER_HZ) == 0 return div_u64(x, NSEC_PER_SEC / USER_HZ); #elif (USER_HZ % 512) == 0 return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); #else /* * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, * overflow after 64.99 years. * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... */ return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); #endif } /** * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds * @j: jiffies64 value * * Return: nanoseconds value */ u64 jiffies64_to_nsecs(u64 j) { #if !(NSEC_PER_SEC % HZ) return (NSEC_PER_SEC / HZ) * j; # else return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN); #endif } EXPORT_SYMBOL(jiffies64_to_nsecs); /** * jiffies64_to_msecs - Convert jiffies64 to milliseconds * @j: jiffies64 value * * Return: milliseconds value */ u64 jiffies64_to_msecs(const u64 j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) return (MSEC_PER_SEC / HZ) * j; #else return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); #endif } EXPORT_SYMBOL(jiffies64_to_msecs); /** * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * * @n: nsecs in u64 * * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. * And this doesn't return MAX_JIFFY_OFFSET since this function is designed * for scheduler, not for use in device drivers to calculate timeout value. * * note: * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years * * Return: nsecs converted to jiffies64 value */ u64 nsecs_to_jiffies64(u64 n) { #if (NSEC_PER_SEC % HZ) == 0 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ return div_u64(n, NSEC_PER_SEC / HZ); #elif (HZ % 512) == 0 /* overflow after 292 years if HZ = 1024 */ return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); #else /* * Generic case - optimized for cases where HZ is a multiple of 3. * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. */ return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); #endif } EXPORT_SYMBOL(nsecs_to_jiffies64); /** * nsecs_to_jiffies - Convert nsecs in u64 to jiffies * * @n: nsecs in u64 * * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. * And this doesn't return MAX_JIFFY_OFFSET since this function is designed * for scheduler, not for use in device drivers to calculate timeout value. * * note: * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years * * Return: nsecs converted to jiffies value */ unsigned long nsecs_to_jiffies(u64 n) { return (unsigned long)nsecs_to_jiffies64(n); } EXPORT_SYMBOL_GPL(nsecs_to_jiffies); /** * timespec64_add_safe - Add two timespec64 values and do a safety check * for overflow. * @lhs: first (left) timespec64 to add * @rhs: second (right) timespec64 to add * * It's assumed that both values are valid (>= 0). * And, each timespec64 is in normalized form. * * Return: sum of @lhs + @rhs */ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs) { struct timespec64 res; set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { res.tv_sec = TIME64_MAX; res.tv_nsec = 0; } return res; } /** * get_timespec64 - get user's time value into kernel space * @ts: destination &struct timespec64 * @uts: user's time value as &struct __kernel_timespec * * Handles compat or 32-bit modes. * * Return: %0 on success or negative errno on error */ int get_timespec64(struct timespec64 *ts, const struct __kernel_timespec __user *uts) { struct __kernel_timespec kts; int ret; ret = copy_from_user(&kts, uts, sizeof(kts)); if (ret) return -EFAULT; ts->tv_sec = kts.tv_sec; /* Zero out the padding in compat mode */ if (in_compat_syscall()) kts.tv_nsec &= 0xFFFFFFFFUL; /* In 32-bit mode, this drops the padding */ ts->tv_nsec = kts.tv_nsec; return 0; } EXPORT_SYMBOL_GPL(get_timespec64); /** * put_timespec64 - convert timespec64 value to __kernel_timespec format and * copy the latter to userspace * @ts: input &struct timespec64 * @uts: user's &struct __kernel_timespec * * Return: %0 on success or negative errno on error */ int put_timespec64(const struct timespec64 *ts, struct __kernel_timespec __user *uts) { struct __kernel_timespec kts = { .tv_sec = ts->tv_sec, .tv_nsec = ts->tv_nsec }; return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(put_timespec64); static int __get_old_timespec32(struct timespec64 *ts64, const struct old_timespec32 __user *cts) { struct old_timespec32 ts; int ret; ret = copy_from_user(&ts, cts, sizeof(ts)); if (ret) return -EFAULT; ts64->tv_sec = ts.tv_sec; ts64->tv_nsec = ts.tv_nsec; return 0; } static int __put_old_timespec32(const struct timespec64 *ts64, struct old_timespec32 __user *cts) { struct old_timespec32 ts = { .tv_sec = ts64->tv_sec, .tv_nsec = ts64->tv_nsec }; return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; } /** * get_old_timespec32 - get user's old-format time value into kernel space * @ts: destination &struct timespec64 * @uts: user's old-format time value (&struct old_timespec32) * * Handles X86_X32_ABI compatibility conversion. * * Return: %0 on success or negative errno on error */ int get_old_timespec32(struct timespec64 *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; else return __get_old_timespec32(ts, uts); } EXPORT_SYMBOL_GPL(get_old_timespec32); /** * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and * copy the latter to userspace * @ts: input &struct timespec64 * @uts: user's &struct old_timespec32 * * Handles X86_X32_ABI compatibility conversion. * * Return: %0 on success or negative errno on error */ int put_old_timespec32(const struct timespec64 *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; else return __put_old_timespec32(ts, uts); } EXPORT_SYMBOL_GPL(put_old_timespec32); /** * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space * @it: destination &struct itimerspec64 * @uit: user's &struct __kernel_itimerspec * * Return: %0 on success or negative errno on error */ int get_itimerspec64(struct itimerspec64 *it, const struct __kernel_itimerspec __user *uit) { int ret; ret = get_timespec64(&it->it_interval, &uit->it_interval); if (ret) return ret; ret = get_timespec64(&it->it_value, &uit->it_value); return ret; } EXPORT_SYMBOL_GPL(get_itimerspec64); /** * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format * and copy the latter to userspace * @it: input &struct itimerspec64 * @uit: user's &struct __kernel_itimerspec * * Return: %0 on success or negative errno on error */ int put_itimerspec64(const struct itimerspec64 *it, struct __kernel_itimerspec __user *uit) { int ret; ret = put_timespec64(&it->it_interval, &uit->it_interval); if (ret) return ret; ret = put_timespec64(&it->it_value, &uit->it_value); return ret; } EXPORT_SYMBOL_GPL(put_itimerspec64); /** * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space * @its: destination &struct itimerspec64 * @uits: user's &struct old_itimerspec32 * * Return: %0 on success or negative errno on error */ int get_old_itimerspec32(struct itimerspec64 *its, const struct old_itimerspec32 __user *uits) { if (__get_old_timespec32(&its->it_interval, &uits->it_interval) || __get_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(get_old_itimerspec32); /** * put_old_itimerspec32 - convert &struct itimerspec64 to &struct * old_itimerspec32 and copy the latter to userspace * @its: input &struct itimerspec64 * @uits: user's &struct old_itimerspec32 * * Return: %0 on success or negative errno on error */ int put_old_itimerspec32(const struct itimerspec64 *its, struct old_itimerspec32 __user *uits) { if (__put_old_timespec32(&its->it_interval, &uits->it_interval) || __put_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(put_old_itimerspec32); |
41 1 282 129 204 280 282 128 280 282 282 8 281 283 10 10 10 10 80 75 126 126 80 126 124 8 8 8 429 429 5 428 430 5 428 430 429 430 430 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 | // SPDX-License-Identifier: GPL-2.0-only /* * jump label support * * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> * Copyright (C) 2011 Peter Zijlstra * */ #include <linux/memory.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/sort.h> #include <linux/err.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/bug.h> #include <linux/cpu.h> #include <asm/sections.h> /* mutex to protect coming/going of the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); void jump_label_lock(void) { mutex_lock(&jump_label_mutex); } void jump_label_unlock(void) { mutex_unlock(&jump_label_mutex); } static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; const struct jump_entry *jeb = b; /* * Entrires are sorted by key. */ if (jump_entry_key(jea) < jump_entry_key(jeb)) return -1; if (jump_entry_key(jea) > jump_entry_key(jeb)) return 1; /* * In the batching mode, entries should also be sorted by the code * inside the already sorted list of entries, enabling a bsearch in * the vector. */ if (jump_entry_code(jea) < jump_entry_code(jeb)) return -1; if (jump_entry_code(jea) > jump_entry_code(jeb)) return 1; return 0; } static void jump_label_swap(void *a, void *b, int size) { long delta = (unsigned long)a - (unsigned long)b; struct jump_entry *jea = a; struct jump_entry *jeb = b; struct jump_entry tmp = *jea; jea->code = jeb->code - delta; jea->target = jeb->target - delta; jea->key = jeb->key - delta; jeb->code = tmp.code + delta; jeb->target = tmp.target + delta; jeb->key = tmp.key + delta; } static void jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) { unsigned long size; void *swapfn = NULL; if (IS_ENABLED(CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE)) swapfn = jump_label_swap; size = (((unsigned long)stop - (unsigned long)start) / sizeof(struct jump_entry)); sort(start, size, sizeof(struct jump_entry), jump_label_cmp, swapfn); } static void jump_label_update(struct static_key *key); /* * There are similar definitions for the !CONFIG_JUMP_LABEL case in jump_label.h. * The use of 'atomic_read()' requires atomic.h and its problematic for some * kernel headers such as kernel.h and others. Since static_key_count() is not * used in the branch statements as it is for the !CONFIG_JUMP_LABEL case its ok * to have it be a function here. Similarly, for 'static_key_enable()' and * 'static_key_disable()', which require bug.h. This should allow jump_label.h * to be included from most/all places for CONFIG_JUMP_LABEL. */ int static_key_count(struct static_key *key) { /* * -1 means the first static_key_slow_inc() is in progress. * static_key_enabled() must return true, so return 1 here. */ int n = atomic_read(&key->enabled); return n >= 0 ? n : 1; } EXPORT_SYMBOL_GPL(static_key_count); /* * static_key_fast_inc_not_disabled - adds a user for a static key * @key: static key that must be already enabled * * The caller must make sure that the static key can't get disabled while * in this function. It doesn't patch jump labels, only adds a user to * an already enabled static key. * * Returns true if the increment was done. Unlike refcount_t the ref counter * is not saturated, but will fail to increment on overflow. */ bool static_key_fast_inc_not_disabled(struct static_key *key) { int v; STATIC_KEY_CHECK_USE(key); /* * Negative key->enabled has a special meaning: it sends * static_key_slow_inc() down the slow path, and it is non-zero * so it counts as "enabled" in jump_label_update(). Note that * atomic_inc_unless_negative() checks >= 0, so roll our own. */ v = atomic_read(&key->enabled); do { if (v <= 0 || (v + 1) < 0) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); return true; } EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled); bool static_key_slow_inc_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); /* * Careful if we get concurrent static_key_slow_inc() calls; * later calls must wait for the first one to _finish_ the * jump_label_update() process. At the same time, however, * the jump_label_update() call below wants to see * static_key_enabled(&key) for jumps to be updated properly. */ if (static_key_fast_inc_not_disabled(key)) return true; jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); jump_label_update(key); /* * Ensure that if the above cmpxchg loop observes our positive * value, it must also observe all the text changes. */ atomic_set_release(&key->enabled, 1); } else { if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) { jump_label_unlock(); return false; } } jump_label_unlock(); return true; } bool static_key_slow_inc(struct static_key *key) { bool ret; cpus_read_lock(); ret = static_key_slow_inc_cpuslocked(key); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(static_key_slow_inc); void static_key_enable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); lockdep_assert_cpus_held(); if (atomic_read(&key->enabled) > 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); jump_label_update(key); /* * See static_key_slow_inc(). */ atomic_set_release(&key->enabled, 1); } jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked); void static_key_enable(struct static_key *key) { cpus_read_lock(); static_key_enable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_enable); void static_key_disable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); lockdep_assert_cpus_held(); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } jump_label_lock(); if (atomic_cmpxchg(&key->enabled, 1, 0)) jump_label_update(key); jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked); void static_key_disable(struct static_key *key) { cpus_read_lock(); static_key_disable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_disable); static bool static_key_slow_try_dec(struct static_key *key) { int val; val = atomic_fetch_add_unless(&key->enabled, -1, 1); if (val == 1) return false; /* * The negative count check is valid even when a negative * key->enabled is in use by static_key_slow_inc(); a * __static_key_slow_dec() before the first static_key_slow_inc() * returns is unbalanced, because all other static_key_slow_inc() * instances block while the update is in progress. */ WARN(val < 0, "jump label: negative count!\n"); return true; } static void __static_key_slow_dec_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); if (static_key_slow_try_dec(key)) return; jump_label_lock(); if (atomic_dec_and_test(&key->enabled)) jump_label_update(key); jump_label_unlock(); } static void __static_key_slow_dec(struct static_key *key) { cpus_read_lock(); __static_key_slow_dec_cpuslocked(key); cpus_read_unlock(); } void jump_label_update_timeout(struct work_struct *work) { struct static_key_deferred *key = container_of(work, struct static_key_deferred, work.work); __static_key_slow_dec(&key->key); } EXPORT_SYMBOL_GPL(jump_label_update_timeout); void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); __static_key_slow_dec(key); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); __static_key_slow_dec_cpuslocked(key); } void __static_key_slow_dec_deferred(struct static_key *key, struct delayed_work *work, unsigned long timeout) { STATIC_KEY_CHECK_USE(key); if (static_key_slow_try_dec(key)) return; schedule_delayed_work(work, timeout); } EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); void __static_key_deferred_flush(void *key, struct delayed_work *work) { STATIC_KEY_CHECK_USE(key); flush_delayed_work(work); } EXPORT_SYMBOL_GPL(__static_key_deferred_flush); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { STATIC_KEY_CHECK_USE(key); key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } EXPORT_SYMBOL_GPL(jump_label_rate_limit); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { if (jump_entry_code(entry) <= (unsigned long)end && jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start) return 1; return 0; } static int __jump_label_text_reserved(struct jump_entry *iter_start, struct jump_entry *iter_stop, void *start, void *end, bool init) { struct jump_entry *iter; iter = iter_start; while (iter < iter_stop) { if (init || !jump_entry_is_init(iter)) { if (addr_conflict(iter, start, end)) return 1; } iter++; } return 0; } #ifndef arch_jump_label_transform_static static void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { /* nothing to do on most architectures */ } #endif static inline struct jump_entry *static_key_entries(struct static_key *key) { WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED); return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK); } static inline bool static_key_type(struct static_key *key) { return key->type & JUMP_TYPE_TRUE; } static inline bool static_key_linked(struct static_key *key) { return key->type & JUMP_TYPE_LINKED; } static inline void static_key_clear_linked(struct static_key *key) { key->type &= ~JUMP_TYPE_LINKED; } static inline void static_key_set_linked(struct static_key *key) { key->type |= JUMP_TYPE_LINKED; } /*** * A 'struct static_key' uses a union such that it either points directly * to a table of 'struct jump_entry' or to a linked list of modules which in * turn point to 'struct jump_entry' tables. * * The two lower bits of the pointer are used to keep track of which pointer * type is in use and to store the initial branch direction, we use an access * function which preserves these bits. */ static void static_key_set_entries(struct static_key *key, struct jump_entry *entries) { unsigned long type; WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK); type = key->type & JUMP_TYPE_MASK; key->entries = entries; key->type |= type; } static enum jump_label_type jump_label_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool enabled = static_key_enabled(key); bool branch = jump_entry_is_branch(entry); /* See the comment in linux/jump_label.h */ return enabled ^ branch; } static bool jump_label_can_update(struct jump_entry *entry, bool init) { /* * Cannot update code that was in an init text area. */ if (!init && jump_entry_is_init(entry)) return false; if (!kernel_text_address(jump_entry_code(entry))) { /* * This skips patching built-in __exit, which * is part of init_section_contains() but is * not part of kernel_text_address(). * * Skipping built-in __exit is fine since it * will never be executed. */ WARN_ONCE(!jump_entry_is_init(entry), "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); return false; } return true; } #ifndef HAVE_JUMP_LABEL_BATCH static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { if (jump_label_can_update(entry, init)) arch_jump_label_transform(entry, jump_label_type(entry)); } } #else static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { if (!jump_label_can_update(entry, init)) continue; if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) { /* * Queue is full: Apply the current queue and try again. */ arch_jump_label_transform_apply(); BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry))); } } arch_jump_label_transform_apply(); } #endif void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; struct static_key *key = NULL; struct jump_entry *iter; /* * Since we are initializing the static_key.enabled field with * with the 'raw' int values (to avoid pulling in atomic.h) in * jump_label.h, let's make sure that is safe. There are only two * cases to check since we initialize to 0 or 1. */ BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0); BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1); if (static_key_initialized) return; cpus_read_lock(); jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; bool in_init; /* rewrite NOPs */ if (jump_label_type(iter) == JUMP_LABEL_NOP) arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); in_init = init_section_contains((void *)jump_entry_code(iter), 1); jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) continue; key = iterk; static_key_set_entries(key, iter); } static_key_initialized = true; jump_label_unlock(); cpus_read_unlock(); } #ifdef CONFIG_MODULES enum jump_label_type jump_label_init_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool type = static_key_type(key); bool branch = jump_entry_is_branch(entry); /* See the comment in linux/jump_label.h */ return type ^ branch; } struct static_key_mod { struct static_key_mod *next; struct jump_entry *entries; struct module *mod; }; static inline struct static_key_mod *static_key_mod(struct static_key *key) { WARN_ON_ONCE(!static_key_linked(key)); return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK); } /*** * key->type and key->next are the same via union. * This sets key->next and preserves the type bits. * * See additional comments above static_key_set_entries(). */ static void static_key_set_mod(struct static_key *key, struct static_key_mod *mod) { unsigned long type; WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK); type = key->type & JUMP_TYPE_MASK; key->next = mod; key->type |= type; } static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; int ret; preempt_disable(); mod = __module_text_address((unsigned long)start); WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); if (!try_module_get(mod)) mod = NULL; preempt_enable(); if (!mod) return 0; ret = __jump_label_text_reserved(mod->jump_entries, mod->jump_entries + mod->num_jump_entries, start, end, mod->state == MODULE_STATE_COMING); module_put(mod); return ret; } static void __jump_label_mod_update(struct static_key *key) { struct static_key_mod *mod; for (mod = static_key_mod(key); mod; mod = mod->next) { struct jump_entry *stop; struct module *m; /* * NULL if the static_key is defined in a module * that does not use it */ if (!mod->entries) continue; m = mod->mod; if (!m) stop = __stop___jump_table; else stop = m->jump_entries + m->num_jump_entries; __jump_label_update(key, mod->entries, stop, m && m->state == MODULE_STATE_COMING); } } static int jump_label_add_module(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; struct static_key_mod *jlm, *jlm2; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) return 0; jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; bool in_init; in_init = within_module_init(jump_entry_code(iter), mod); jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) continue; key = iterk; if (within_module((unsigned long)key, mod)) { static_key_set_entries(key, iter); continue; } jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; if (!static_key_linked(key)) { jlm2 = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm2) { kfree(jlm); return -ENOMEM; } preempt_disable(); jlm2->mod = __module_address((unsigned long)key); preempt_enable(); jlm2->entries = static_key_entries(key); jlm2->next = NULL; static_key_set_mod(key, jlm2); static_key_set_linked(key); } jlm->mod = mod; jlm->entries = iter; jlm->next = static_key_mod(key); static_key_set_mod(key, jlm); static_key_set_linked(key); /* Only update if we've changed from our initial state */ if (jump_label_type(iter) != jump_label_init_type(iter)) __jump_label_update(key, iter, iter_stop, true); } return 0; } static void jump_label_del_module(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { if (jump_entry_key(iter) == key) continue; key = jump_entry_key(iter); if (within_module((unsigned long)key, mod)) continue; /* No memory during module load */ if (WARN_ON(!static_key_linked(key))) continue; prev = &key->next; jlm = static_key_mod(key); while (jlm && jlm->mod != mod) { prev = &jlm->next; jlm = jlm->next; } /* No memory during module load */ if (WARN_ON(!jlm)) continue; if (prev == &key->next) static_key_set_mod(key, jlm->next); else *prev = jlm->next; kfree(jlm); jlm = static_key_mod(key); /* if only one etry is left, fold it back into the static_key */ if (jlm->next == NULL) { static_key_set_entries(key, jlm->entries); static_key_clear_linked(key); kfree(jlm); } } } static int jump_label_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; int ret = 0; cpus_read_lock(); jump_label_lock(); switch (val) { case MODULE_STATE_COMING: ret = jump_label_add_module(mod); if (ret) { WARN(1, "Failed to allocate memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } break; case MODULE_STATE_GOING: jump_label_del_module(mod); break; } jump_label_unlock(); cpus_read_unlock(); return notifier_from_errno(ret); } static struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, .priority = 1, /* higher than tracepoints */ }; static __init int jump_label_init_module(void) { return register_module_notifier(&jump_label_module_nb); } early_initcall(jump_label_init_module); #endif /* CONFIG_MODULES */ /*** * jump_label_text_reserved - check if addr range is reserved * @start: start text addr * @end: end text addr * * checks if the text addr located between @start and @end * overlaps with any of the jump label patch addresses. Code * that wants to modify kernel text should first verify that * it does not overlap with any of the jump label addresses. * Caller must hold jump_label_mutex. * * returns 1 if there is an overlap, 0 otherwise */ int jump_label_text_reserved(void *start, void *end) { bool init = system_state < SYSTEM_RUNNING; int ret = __jump_label_text_reserved(__start___jump_table, __stop___jump_table, start, end, init); if (ret) return ret; #ifdef CONFIG_MODULES ret = __jump_label_mod_text_reserved(start, end); #endif return ret; } static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; bool init = system_state < SYSTEM_RUNNING; struct jump_entry *entry; #ifdef CONFIG_MODULES struct module *mod; if (static_key_linked(key)) { __jump_label_mod_update(key); return; } preempt_disable(); mod = __module_address((unsigned long)key); if (mod) { stop = mod->jump_entries + mod->num_jump_entries; init = mod->state == MODULE_STATE_COMING; } preempt_enable(); #endif entry = static_key_entries(key); /* if there are no users, entry can be NULL */ if (entry) __jump_label_update(key, entry, stop, init); } #ifdef CONFIG_STATIC_KEYS_SELFTEST static DEFINE_STATIC_KEY_TRUE(sk_true); static DEFINE_STATIC_KEY_FALSE(sk_false); static __init int jump_label_test(void) { int i; for (i = 0; i < 2; i++) { WARN_ON(static_key_enabled(&sk_true.key) != true); WARN_ON(static_key_enabled(&sk_false.key) != false); WARN_ON(!static_branch_likely(&sk_true)); WARN_ON(!static_branch_unlikely(&sk_true)); WARN_ON(static_branch_likely(&sk_false)); WARN_ON(static_branch_unlikely(&sk_false)); static_branch_disable(&sk_true); static_branch_enable(&sk_false); WARN_ON(static_key_enabled(&sk_true.key) == true); WARN_ON(static_key_enabled(&sk_false.key) == false); WARN_ON(static_branch_likely(&sk_true)); WARN_ON(static_branch_unlikely(&sk_true)); WARN_ON(!static_branch_likely(&sk_false)); WARN_ON(!static_branch_unlikely(&sk_false)); static_branch_enable(&sk_true); static_branch_disable(&sk_false); } return 0; } early_initcall(jump_label_test); #endif /* STATIC_KEYS_SELFTEST */ |
35 35 32 3 2 32 25 7 20 14 3 4 3 14 9 4 13 3 5 8 8 8 9 8 3 6 20 20 3 9 9 9 12 7 7 3 2 10 7 3 3 6 6 1 6 2 3 6 4 7 2 9 19 12 1 6 5 1 3 3 2 10 3 1 1 5 3 5 3 32 28 32 3 28 1 3 6 1 1 2 2 1 1 1 1 3 3 3 3 3 3 5 5 33 7 34 33 34 7 7 2 3 6 2 1 1 1 3 3 3 3 3 3 3 3 9 8 8 3 26 1 26 26 5 20 5 1 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 | /* * hugetlbpage-backed filesystem. Based on ramfs. * * Nadia Yvette Chambers, 2002 * * Copyright (C) 2002 Linus Torvalds. * License: GPL */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/thread_info.h> #include <asm/current.h> #include <linux/falloc.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/file.h> #include <linux/kernel.h> #include <linux/writeback.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/string.h> #include <linux/capability.h> #include <linux/ctype.h> #include <linux/backing-dev.h> #include <linux/hugetlb.h> #include <linux/pagevec.h> #include <linux/fs_parser.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/dnotify.h> #include <linux/statfs.h> #include <linux/security.h> #include <linux/magic.h> #include <linux/migrate.h> #include <linux/uio.h> #include <linux/uaccess.h> #include <linux/sched/mm.h> static const struct address_space_operations hugetlbfs_aops; const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT }; struct hugetlbfs_fs_context { struct hstate *hstate; unsigned long long max_size_opt; unsigned long long min_size_opt; long max_hpages; long nr_inodes; long min_hpages; enum hugetlbfs_size_type max_val_type; enum hugetlbfs_size_type min_val_type; kuid_t uid; kgid_t gid; umode_t mode; }; int sysctl_hugetlb_shm_group; enum hugetlb_param { Opt_gid, Opt_min_size, Opt_mode, Opt_nr_inodes, Opt_pagesize, Opt_size, Opt_uid, }; static const struct fs_parameter_spec hugetlb_fs_parameters[] = { fsparam_u32 ("gid", Opt_gid), fsparam_string("min_size", Opt_min_size), fsparam_u32oct("mode", Opt_mode), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("pagesize", Opt_pagesize), fsparam_string("size", Opt_size), fsparam_u32 ("uid", Opt_uid), {} }; /* * Mask used when checking the page offset value passed in via system * calls. This value will be converted to a loff_t which is signed. * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the * value. The extra bit (- 1 in the shift value) is to take the sign * bit into account. */ #define PGOFF_LOFFT_MAX \ (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); vm_flags_t vm_flags; /* * vma address alignment (but not the pgoff alignment) has * already been checked by prepare_hugepage_range. If you add * any error returns here, do so after setting VM_HUGETLB, so * is_vm_hugetlb_page tests below unmap_region go the right * way when do_mmap unwinds (may be important on powerpc * and ia64). */ vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; ret = seal_check_write(info->seals, vma); if (ret) return ret; /* * page based offset in vm_pgoff could be sufficiently large to * overflow a loff_t when converted to byte offset. This can * only happen on architectures where sizeof(loff_t) == * sizeof(unsigned long). So, only check in those instances. */ if (sizeof(unsigned long) == sizeof(loff_t)) { if (vma->vm_pgoff & PGOFF_LOFFT_MAX) return -EINVAL; } /* must be huge page aligned */ if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; vma_len = (loff_t)(vma->vm_end - vma->vm_start); len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* check for overflow */ if (len < vma_len) return -EINVAL; inode_lock(inode); file_accessed(file); ret = -ENOMEM; vm_flags = vma->vm_flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode * flag S_PRIVATE is set. */ if (inode->i_flags & S_PRIVATE) vm_flags |= VM_NORESERVE; if (!hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, vm_flags)) goto out; ret = 0; if (vma->vm_flags & VM_WRITE && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); return ret; } /* * Called under mmap_write_lock(mm). */ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; info.high_limit = arch_get_mmap_end(addr, len, flags); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. */ if (unlikely(offset_in_page(addr))) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; info.high_limit = arch_get_mmap_end(addr, len, flags); addr = vm_unmapped_area(&info); } return addr; } unsigned long generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len & ~huge_page_mask(h)) return -EINVAL; if (len > TASK_SIZE) return -ENOMEM; if (flags & MAP_FIXED) { if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } /* * Use mm->get_unmapped_area value as a hint to use topdown routine. * If architectures have special needs, they should define their own * version of hugetlb_get_unmapped_area. */ if (mm->get_unmapped_area == arch_get_unmapped_area_topdown) return hugetlb_get_unmapped_area_topdown(file, addr, len, pgoff, flags); return hugetlb_get_unmapped_area_bottomup(file, addr, len, pgoff, flags); } #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA static unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); } #endif /* * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset. * Returns the maximum number of bytes one can read without touching the 1st raw * HWPOISON subpage. * * The implementation borrows the iteration logic from copy_page_to_iter*. */ static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes) { size_t n = 0; size_t res = 0; /* First subpage to start the loop. */ page = nth_page(page, offset / PAGE_SIZE); offset %= PAGE_SIZE; while (1) { if (is_raw_hwpoison_page_in_hugepage(page)) break; /* Safe to read n bytes without touching HWPOISON subpage. */ n = min(bytes, (size_t)PAGE_SIZE - offset); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page = nth_page(page, 1); offset = 0; } } return res; } /* * Support for read() - Find the page attached to f_mapping and copy out the * data. This provides functionality similar to filemap_read(). */ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct hstate *h = hstate_file(file); struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; unsigned long index = iocb->ki_pos >> huge_page_shift(h); unsigned long offset = iocb->ki_pos & ~huge_page_mask(h); unsigned long end_index; loff_t isize; ssize_t retval = 0; while (iov_iter_count(to)) { struct folio *folio; size_t nr, copied, want; /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); isize = i_size_read(inode); if (!isize) break; end_index = (isize - 1) >> huge_page_shift(h); if (index > end_index) break; if (index == end_index) { nr = ((isize - 1) & ~huge_page_mask(h)) + 1; if (nr <= offset) break; } nr = nr - offset; /* Find the folio */ folio = filemap_lock_hugetlb_folio(h, mapping, index); if (IS_ERR(folio)) { /* * We have a HOLE, zero out the user-buffer for the * length of the hole or request. */ copied = iov_iter_zero(nr, to); } else { folio_unlock(folio); if (!folio_test_hwpoison(folio)) want = nr; else { /* * Adjust how many bytes safe to read without * touching the 1st raw HWPOISON subpage after * offset. */ want = adjust_range_hwpoison(&folio->page, offset, nr); if (want == 0) { folio_put(folio); retval = -EIO; break; } } /* * We have the folio, copy it to user space buffer. */ copied = copy_folio_to_iter(folio, offset, want, to); folio_put(folio); } offset += copied; retval += copied; if (copied != nr && iov_iter_count(to)) { if (!retval) retval = -EFAULT; break; } index += offset >> huge_page_shift(h); offset &= ~huge_page_mask(h); } iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset; return retval; } static int hugetlbfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { return -EINVAL; } static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { BUG(); return -EINVAL; } static void hugetlb_delete_from_page_cache(struct folio *folio) { folio_clear_dirty(folio); folio_clear_uptodate(folio); filemap_remove_folio(folio); } /* * Called with i_mmap_rwsem held for inode based vma maps. This makes * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault * mutex for the page in the mapping. So, we can not race with page being * faulted into the vma. */ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { pte_t *ptep, pte; ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma))); if (!ptep) return false; pte = huge_ptep_get(ptep); if (huge_pte_none(pte) || !pte_present(pte)) return false; if (pte_page(pte) == page) return true; return false; } /* * Can vma_offset_start/vma_offset_end overflow on 32-bit arches? * No, because the interval tree returns us only those vmas * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. */ static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start) { unsigned long offset = 0; if (vma->vm_pgoff < start) offset = (start - vma->vm_pgoff) << PAGE_SHIFT; return vma->vm_start + offset; } static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end) { unsigned long t_end; if (!end) return vma->vm_end; t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start; if (t_end > vma->vm_end) t_end = vma->vm_end; return t_end; } /* * Called with hugetlb fault mutex held. Therefore, no more mappings to * this folio can be created while executing the routine. */ static void hugetlb_unmap_file_folio(struct hstate *h, struct address_space *mapping, struct folio *folio, pgoff_t index) { struct rb_root_cached *root = &mapping->i_mmap; struct hugetlb_vma_lock *vma_lock; struct page *page = &folio->page; struct vm_area_struct *vma; unsigned long v_start; unsigned long v_end; pgoff_t start, end; start = index * pages_per_huge_page(h); end = (index + 1) * pages_per_huge_page(h); i_mmap_lock_write(mapping); retry: vma_lock = NULL; vma_interval_tree_foreach(vma, root, start, end - 1) { v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); if (!hugetlb_vma_maps_page(vma, v_start, page)) continue; if (!hugetlb_vma_trylock_write(vma)) { vma_lock = vma->vm_private_data; /* * If we can not get vma lock, we need to drop * immap_sema and take locks in order. First, * take a ref on the vma_lock structure so that * we can be guaranteed it will not go away when * dropping immap_sema. */ kref_get(&vma_lock->refs); break; } unmap_hugepage_range(vma, v_start, v_end, NULL, ZAP_FLAG_DROP_MARKER); hugetlb_vma_unlock_write(vma); } i_mmap_unlock_write(mapping); if (vma_lock) { /* * Wait on vma_lock. We know it is still valid as we have * a reference. We must 'open code' vma locking as we do * not know if vma_lock is still attached to vma. */ down_write(&vma_lock->rw_sema); i_mmap_lock_write(mapping); vma = vma_lock->vma; if (!vma) { /* * If lock is no longer attached to vma, then just * unlock, drop our reference and retry looking for * other vmas. */ up_write(&vma_lock->rw_sema); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); goto retry; } /* * vma_lock is still attached to vma. Check to see if vma * still maps page and if so, unmap. */ v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); if (hugetlb_vma_maps_page(vma, v_start, page)) unmap_hugepage_range(vma, v_start, v_end, NULL, ZAP_FLAG_DROP_MARKER); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); hugetlb_vma_unlock_write(vma); goto retry; } } static void hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, zap_flags_t zap_flags) { struct vm_area_struct *vma; /* * end == 0 indicates that the entire range after start should be * unmapped. Note, end is exclusive, whereas the interval tree takes * an inclusive "last". */ vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { unsigned long v_start; unsigned long v_end; if (!hugetlb_vma_trylock_write(vma)) continue; v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags); /* * Note that vma lock only exists for shared/non-private * vmas. Therefore, lock is not held when calling * unmap_hugepage_range for private vmas. */ hugetlb_vma_unlock_write(vma); } } /* * Called with hugetlb fault mutex held. * Returns true if page was actually removed, false otherwise. */ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, struct address_space *mapping, struct folio *folio, pgoff_t index, bool truncate_op) { bool ret = false; /* * If folio is mapped, it was faulted in after being * unmapped in caller. Unmap (again) while holding * the fault mutex. The mutex will prevent faults * until we finish removing the folio. */ if (unlikely(folio_mapped(folio))) hugetlb_unmap_file_folio(h, mapping, folio, index); folio_lock(folio); /* * We must remove the folio from page cache before removing * the region/ reserve map (hugetlb_unreserve_pages). In * rare out of memory conditions, removal of the region/reserve * map could fail. Correspondingly, the subpool and global * reserve usage count can need to be adjusted. */ VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio); hugetlb_delete_from_page_cache(folio); ret = true; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1))) hugetlb_fix_reserve_counts(inode); } folio_unlock(folio); return ret; } /* * remove_inode_hugepages handles two distinct cases: truncation and hole * punch. There are subtle differences in operation for each case. * * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve * maps and global counts. Page faults can race with truncation. * During faults, hugetlb_no_page() checks i_size before page allocation, * and again after obtaining page table lock. It will 'back out' * allocations in the truncated range. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserve map * deleted. The region/reserve map for ranges without associated * pages are not modified. Page faults can race with hole punch. * This is indicated if we find a mapped page. * Note: If the passed end of range value is beyond the end of file, but * not LLONG_MAX this routine still performs a hole punch operation. */ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, loff_t lend) { struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; const pgoff_t end = lend >> PAGE_SHIFT; struct folio_batch fbatch; pgoff_t next, index; int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); folio_batch_init(&fbatch); next = lstart >> PAGE_SHIFT; while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); ++i) { struct folio *folio = fbatch.folios[i]; u32 hash = 0; index = folio->index >> huge_page_order(h); hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * Remove folio that was part of folio_batch. */ if (remove_inode_single_folio(h, inode, mapping, folio, index, truncate_op)) freed++; mutex_unlock(&hugetlb_fault_mutex_table[hash]); } folio_batch_release(&fbatch); cond_resched(); } if (truncate_op) (void)hugetlb_unreserve_pages(inode, lstart >> huge_page_shift(h), LONG_MAX, freed); } static void hugetlbfs_evict_inode(struct inode *inode) { struct resv_map *resv_map; remove_inode_hugepages(inode, 0, LLONG_MAX); /* * Get the resv_map from the address space embedded in the inode. * This is the address space which points to any resv_map allocated * at inode creation time. If this is a device special inode, * i_mapping may not point to the original address space. */ resv_map = (struct resv_map *)(&inode->i_data)->i_private_data; /* Only regular and link inodes have associated reserve maps */ if (resv_map) resv_map_release(&resv_map->refs); clear_inode(inode); } static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0, ZAP_FLAG_DROP_MARKER); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); } static void hugetlbfs_zero_partial_page(struct hstate *h, struct address_space *mapping, loff_t start, loff_t end) { pgoff_t idx = start >> huge_page_shift(h); struct folio *folio; folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) return; start = start & ~huge_page_mask(h); end = end & ~huge_page_mask(h); if (!end) end = huge_page_size(h); folio_zero_segment(folio, (size_t)start, (size_t)end); folio_unlock(folio); folio_put(folio); } static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); loff_t hpage_size = huge_page_size(h); loff_t hole_start, hole_end; /* * hole_start and hole_end indicate the full pages within the hole. */ hole_start = round_up(offset, hpage_size); hole_end = round_down(offset + len, hpage_size); inode_lock(inode); /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { inode_unlock(inode); return -EPERM; } i_mmap_lock_write(mapping); /* If range starts before first full page, zero partial page. */ if (offset < hole_start) hugetlbfs_zero_partial_page(h, mapping, offset, min(offset + len, hole_start)); /* Unmap users of full pages in the hole. */ if (hole_end > hole_start) { if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT, 0); } /* If range extends beyond last full page, zero partial page. */ if ((offset + len) > hole_end && (offset + len) > hole_start) hugetlbfs_zero_partial_page(h, mapping, hole_end, offset + len); i_mmap_unlock_write(mapping); /* Remove full pages from the file. */ if (hole_end > hole_start) remove_inode_hugepages(inode, hole_start, hole_end); inode_unlock(inode); return 0; } static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); struct vm_area_struct pseudo_vma; struct mm_struct *mm = current->mm; loff_t hpage_size = huge_page_size(h); unsigned long hpage_shift = huge_page_shift(h); pgoff_t start, index, end; int error; u32 hash; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) return hugetlbfs_punch_hole(inode, offset, len); /* * Default preallocate case. * For this range, start is rounded down and end is rounded up * as well as being converted to page offsets. */ start = offset >> hpage_shift; end = (offset + len + hpage_size - 1) >> hpage_shift; inode_lock(inode); /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); if (error) goto out; if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { error = -EPERM; goto out; } /* * Initialize a pseudo vma as this is required by the huge page * allocation routines. */ vma_init(&pseudo_vma, mm); vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; for (index = start; index < end; index++) { /* * This is supposed to be the vaddr where the page is being * faulted in, but we have no vaddr here. */ struct folio *folio; unsigned long addr; cond_resched(); /* * fallocate(2) manpage permits EINTR; we may have been * interrupted because we are using up too much memory. */ if (signal_pending(current)) { error = -EINTR; break; } /* addr is the offset within the file (zero based) */ addr = index * hpage_size; /* mutex taken here, fault path and hole punch */ hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ folio = filemap_get_folio(mapping, index << huge_page_order(h)); if (!IS_ERR(folio)) { folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); continue; } /* * Allocate folio without setting the avoid_reserve argument. * There certainly are no reserves associated with the * pseudo_vma. However, there could be shared mappings with * reserves for the file at the inode level. If we fallocate * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); error = PTR_ERR(folio); goto out; } clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { restore_reserve_on_error(h, &pseudo_vma, addr, folio); folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; } mutex_unlock(&hugetlb_fault_mutex_table[hash]); folio_set_hugetlb_migratable(folio); /* * folio_unlock because locked by hugetlb_add_to_page_cache() * folio_put() due to reference from alloc_hugetlb_folio() */ folio_unlock(folio); folio_put(folio); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); inode_set_ctime_current(inode); out: inode_unlock(inode); return error; } static int hugetlbfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); error = setattr_prepare(idmap, dentry, attr); if (error) return error; if (ia_valid & ATTR_SIZE) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; if (newsize & ~huge_page_mask(h)) return -EINVAL; /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; hugetlb_vmtruncate(inode, newsize); } setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); return 0; } static struct inode *hugetlbfs_get_root(struct super_block *sb, struct hugetlbfs_fs_context *ctx) { struct inode *inode; inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = S_IFDIR | ctx->mode; inode->i_uid = ctx->uid; inode->i_gid = ctx->gid; simple_inode_init_ts(inode); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); lockdep_annotate_inode_mutex_key(inode); } return inode; } /* * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never * be taken from reclaim -- unlike regular filesystems. This needs an * annotation because huge_pmd_share() does an allocation under hugetlb's * i_mmap_rwsem. */ static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct mnt_idmap *idmap, struct inode *dir, umode_t mode, dev_t dev) { struct inode *inode; struct resv_map *resv_map = NULL; /* * Reserve maps are only needed for inodes that can have associated * page allocations. */ if (S_ISREG(mode) || S_ISLNK(mode)) { resv_map = resv_map_alloc(); if (!resv_map) return NULL; } inode = new_inode(sb); if (inode) { struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode->i_ino = get_next_ino(); inode_init_owner(idmap, inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; simple_inode_init_ts(inode); inode->i_mapping->i_private_data = resv_map; info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); break; case S_IFREG: inode->i_op = &hugetlbfs_inode_operations; inode->i_fop = &hugetlbfs_file_operations; break; case S_IFDIR: inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); break; } lockdep_annotate_inode_mutex_key(inode); } else { if (resv_map) kref_put(&resv_map->refs, resv_map_release); } return inode; } /* * File creation. Allocate an inode, and we're done.. */ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev); if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); d_instantiate(dentry, inode); dget(dentry);/* Extra count - pin the dentry in core */ return 0; } static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return retval; } static int hugetlbfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0); if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); d_tmpfile(file, inode); return finish_open_simple(file, 0); } static int hugetlbfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { const umode_t mode = S_IFLNK|S_IRWXUGO; struct inode *inode; int error = -ENOSPC; inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0); if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); if (!error) { d_instantiate(dentry, inode); dget(dentry); } else iput(inode); } inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); return error; } #ifdef CONFIG_MIGRATION static int hugetlbfs_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { int rc; rc = migrate_huge_page_move_mapping(mapping, dst, src); if (rc != MIGRATEPAGE_SUCCESS) return rc; if (hugetlb_folio_subpool(src)) { hugetlb_set_folio_subpool(dst, hugetlb_folio_subpool(src)); hugetlb_set_folio_subpool(src, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) folio_migrate_copy(dst, src); else folio_migrate_flags(dst, src); return MIGRATEPAGE_SUCCESS; } #else #define hugetlbfs_migrate_folio NULL #endif static int hugetlbfs_error_remove_folio(struct address_space *mapping, struct folio *folio) { return 0; } /* * Display the mount options in /proc/mounts. */ static int hugetlbfs_show_options(struct seq_file *m, struct dentry *root) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(root->d_sb); struct hugepage_subpool *spool = sbinfo->spool; unsigned long hpage_size = huge_page_size(sbinfo->hstate); unsigned hpage_shift = huge_page_shift(sbinfo->hstate); char mod; if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, sbinfo->uid)); if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); if (sbinfo->mode != 0755) seq_printf(m, ",mode=%o", sbinfo->mode); if (sbinfo->max_inodes != -1) seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes); hpage_size /= 1024; mod = 'K'; if (hpage_size >= 1024) { hpage_size /= 1024; mod = 'M'; } seq_printf(m, ",pagesize=%lu%c", hpage_size, mod); if (spool) { if (spool->max_hpages != -1) seq_printf(m, ",size=%llu", (unsigned long long)spool->max_hpages << hpage_shift); if (spool->min_hpages != -1) seq_printf(m, ",min_size=%llu", (unsigned long long)spool->min_hpages << hpage_shift); } return 0; } static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); struct hstate *h = hstate_inode(d_inode(dentry)); u64 id = huge_encode_dev(dentry->d_sb->s_dev); buf->f_fsid = u64_to_fsid(id); buf->f_type = HUGETLBFS_MAGIC; buf->f_bsize = huge_page_size(h); if (sbinfo) { spin_lock(&sbinfo->stat_lock); /* If no limits set, just report 0 or -1 for max/free/used * blocks, like simple_statfs() */ if (sbinfo->spool) { long free_pages; spin_lock_irq(&sbinfo->spool->lock); buf->f_blocks = sbinfo->spool->max_hpages; free_pages = sbinfo->spool->max_hpages - sbinfo->spool->used_hpages; buf->f_bavail = buf->f_bfree = free_pages; spin_unlock_irq(&sbinfo->spool->lock); buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_inodes; } spin_unlock(&sbinfo->stat_lock); } buf->f_namelen = NAME_MAX; return 0; } static void hugetlbfs_put_super(struct super_block *sb) { struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb); if (sbi) { sb->s_fs_info = NULL; if (sbi->spool) hugepage_put_subpool(sbi->spool); kfree(sbi); } } static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) { if (sbinfo->free_inodes >= 0) { spin_lock(&sbinfo->stat_lock); if (unlikely(!sbinfo->free_inodes)) { spin_unlock(&sbinfo->stat_lock); return 0; } sbinfo->free_inodes--; spin_unlock(&sbinfo->stat_lock); } return 1; } static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) { if (sbinfo->free_inodes >= 0) { spin_lock(&sbinfo->stat_lock); sbinfo->free_inodes++; spin_unlock(&sbinfo->stat_lock); } } static struct kmem_cache *hugetlbfs_inode_cachep; static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); struct hugetlbfs_inode_info *p; if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) return NULL; p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL); if (unlikely(!p)) { hugetlbfs_inc_free_inodes(sbinfo); return NULL; } return &p->vfs_inode; } static void hugetlbfs_free_inode(struct inode *inode) { kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); } static const struct address_space_operations hugetlbfs_aops = { .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, .dirty_folio = noop_dirty_folio, .migrate_folio = hugetlbfs_migrate_folio, .error_remove_folio = hugetlbfs_error_remove_folio, }; static void init_once(void *foo) { struct hugetlbfs_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, .fallocate = hugetlbfs_fallocate, }; static const struct inode_operations hugetlbfs_dir_inode_operations = { .create = hugetlbfs_create, .lookup = simple_lookup, .link = simple_link, .unlink = simple_unlink, .symlink = hugetlbfs_symlink, .mkdir = hugetlbfs_mkdir, .rmdir = simple_rmdir, .mknod = hugetlbfs_mknod, .rename = simple_rename, .setattr = hugetlbfs_setattr, .tmpfile = hugetlbfs_tmpfile, }; static const struct inode_operations hugetlbfs_inode_operations = { .setattr = hugetlbfs_setattr, }; static const struct super_operations hugetlbfs_ops = { .alloc_inode = hugetlbfs_alloc_inode, .free_inode = hugetlbfs_free_inode, .destroy_inode = hugetlbfs_destroy_inode, .evict_inode = hugetlbfs_evict_inode, .statfs = hugetlbfs_statfs, .put_super = hugetlbfs_put_super, .show_options = hugetlbfs_show_options, }; /* * Convert size option passed from command line to number of huge pages * in the pool specified by hstate. Size option could be in bytes * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT). */ static long hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, enum hugetlbfs_size_type val_type) { if (val_type == NO_SIZE) return -1; if (val_type == SIZE_PERCENT) { size_opt <<= huge_page_shift(h); size_opt *= h->max_huge_pages; do_div(size_opt, 100); } size_opt >>= huge_page_shift(h); return size_opt; } /* * Parse one mount parameter. */ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hugetlbfs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; struct hstate *h; char *rest; unsigned long ps; int opt; opt = fs_parse(fc, hugetlb_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_uid: ctx->uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(ctx->uid)) goto bad_val; return 0; case Opt_gid: ctx->gid = make_kgid(current_user_ns(), result.uint_32); if (!gid_valid(ctx->gid)) goto bad_val; return 0; case Opt_mode: ctx->mode = result.uint_32 & 01777U; return 0; case Opt_size: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->max_size_opt = memparse(param->string, &rest); ctx->max_val_type = SIZE_STD; if (*rest == '%') ctx->max_val_type = SIZE_PERCENT; return 0; case Opt_nr_inodes: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->nr_inodes = memparse(param->string, &rest); return 0; case Opt_pagesize: ps = memparse(param->string, &rest); h = size_to_hstate(ps); if (!h) { pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); return -EINVAL; } ctx->hstate = h; return 0; case Opt_min_size: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->min_size_opt = memparse(param->string, &rest); ctx->min_val_type = SIZE_STD; if (*rest == '%') ctx->min_val_type = SIZE_PERCENT; return 0; default: return -EINVAL; } bad_val: return invalfc(fc, "Bad value '%s' for mount option '%s'\n", param->string, param->key); } /* * Validate the parsed options. */ static int hugetlbfs_validate(struct fs_context *fc) { struct hugetlbfs_fs_context *ctx = fc->fs_private; /* * Use huge page pool size (in hstate) to convert the size * options to number of huge pages. If NO_SIZE, -1 is returned. */ ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate, ctx->max_size_opt, ctx->max_val_type); ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate, ctx->min_size_opt, ctx->min_val_type); /* * If max_size was specified, then min_size must be smaller */ if (ctx->max_val_type > NO_SIZE && ctx->min_hpages > ctx->max_hpages) { pr_err("Minimum size can not be greater than maximum size\n"); return -EINVAL; } return 0; } static int hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct hugetlbfs_fs_context *ctx = fc->fs_private; struct hugetlbfs_sb_info *sbinfo; sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); if (!sbinfo) return -ENOMEM; sb->s_fs_info = sbinfo; spin_lock_init(&sbinfo->stat_lock); sbinfo->hstate = ctx->hstate; sbinfo->max_inodes = ctx->nr_inodes; sbinfo->free_inodes = ctx->nr_inodes; sbinfo->spool = NULL; sbinfo->uid = ctx->uid; sbinfo->gid = ctx->gid; sbinfo->mode = ctx->mode; /* * Allocate and initialize subpool if maximum or minimum size is * specified. Any needed reservations (for minimum size) are taken * when the subpool is created. */ if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { sbinfo->spool = hugepage_new_subpool(ctx->hstate, ctx->max_hpages, ctx->min_hpages); if (!sbinfo->spool) goto out_free; } sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = huge_page_size(ctx->hstate); sb->s_blocksize_bits = huge_page_shift(ctx->hstate); sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; /* * Due to the special and limited functionality of hugetlbfs, it does * not work well as a stacking filesystem. */ sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx)); if (!sb->s_root) goto out_free; return 0; out_free: kfree(sbinfo->spool); kfree(sbinfo); return -ENOMEM; } static int hugetlbfs_get_tree(struct fs_context *fc) { int err = hugetlbfs_validate(fc); if (err) return err; return get_tree_nodev(fc, hugetlbfs_fill_super); } static void hugetlbfs_fs_context_free(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations hugetlbfs_fs_context_ops = { .free = hugetlbfs_fs_context_free, .parse_param = hugetlbfs_parse_param, .get_tree = hugetlbfs_get_tree, }; static int hugetlbfs_init_fs_context(struct fs_context *fc) { struct hugetlbfs_fs_context *ctx; ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->max_hpages = -1; /* No limit on size by default */ ctx->nr_inodes = -1; /* No limit on number of inodes by default */ ctx->uid = current_fsuid(); ctx->gid = current_fsgid(); ctx->mode = 0755; ctx->hstate = &default_hstate; ctx->min_hpages = -1; /* No default minimum size */ ctx->max_val_type = NO_SIZE; ctx->min_val_type = NO_SIZE; fc->fs_private = ctx; fc->ops = &hugetlbfs_fs_context_ops; return 0; } static struct file_system_type hugetlbfs_fs_type = { .name = "hugetlbfs", .init_fs_context = hugetlbfs_init_fs_context, .parameters = hugetlb_fs_parameters, .kill_sb = kill_litter_super, .fs_flags = FS_ALLOW_IDMAP, }; static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; static int can_do_hugetlb_shm(void) { kgid_t shm_group; shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group); return capable(CAP_IPC_LOCK) || in_group_p(shm_group); } static int get_hstate_idx(int page_size_log) { struct hstate *h = hstate_sizelog(page_size_log); if (!h) return -1; return hstate_index(h); } /* * Note that size should be aligned to proper hugepage size in caller side, * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. */ struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, int creat_flags, int page_size_log) { struct inode *inode; struct vfsmount *mnt; int hstate_idx; struct file *file; hstate_idx = get_hstate_idx(page_size_log); if (hstate_idx < 0) return ERR_PTR(-ENODEV); mnt = hugetlbfs_vfsmount[hstate_idx]; if (!mnt) return ERR_PTR(-ENOENT); if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { struct ucounts *ucounts = current_ucounts(); if (user_shm_lock(size, ucounts)) { pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n", current->comm, current->pid); user_shm_unlock(size, ucounts); } return ERR_PTR(-EPERM); } file = ERR_PTR(-ENOSPC); /* hugetlbfs_vfsmount[] mounts do not use idmapped mounts. */ inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) goto out; if (creat_flags == HUGETLB_SHMFS_INODE) inode->i_flags |= S_PRIVATE; inode->i_size = size; clear_nlink(inode); if (!hugetlb_reserve_pages(inode, 0, size >> huge_page_shift(hstate_inode(inode)), NULL, acctflag)) file = ERR_PTR(-ENOMEM); else file = alloc_file_pseudo(inode, mnt, name, O_RDWR, &hugetlbfs_file_operations); if (!IS_ERR(file)) return file; iput(inode); out: return file; } static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) { struct fs_context *fc; struct vfsmount *mnt; fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT); if (IS_ERR(fc)) { mnt = ERR_CAST(fc); } else { struct hugetlbfs_fs_context *ctx = fc->fs_private; ctx->hstate = h; mnt = fc_mount(fc); put_fs_context(fc); } if (IS_ERR(mnt)) pr_err("Cannot mount internal hugetlbfs for page size %luK", huge_page_size(h) / SZ_1K); return mnt; } static int __init init_hugetlbfs_fs(void) { struct vfsmount *mnt; struct hstate *h; int error; int i; if (!hugepages_supported()) { pr_info("disabling because there are no supported hugepage sizes\n"); return -ENOTSUPP; } error = -ENOMEM; hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), 0, SLAB_ACCOUNT, init_once); if (hugetlbfs_inode_cachep == NULL) goto out; error = register_filesystem(&hugetlbfs_fs_type); if (error) goto out_free; /* default hstate mount is required */ mnt = mount_one_hugetlbfs(&default_hstate); if (IS_ERR(mnt)) { error = PTR_ERR(mnt); goto out_unreg; } hugetlbfs_vfsmount[default_hstate_idx] = mnt; /* other hstates are optional */ i = 0; for_each_hstate(h) { if (i == default_hstate_idx) { i++; continue; } mnt = mount_one_hugetlbfs(h); if (IS_ERR(mnt)) hugetlbfs_vfsmount[i] = NULL; else hugetlbfs_vfsmount[i] = mnt; i++; } return 0; out_unreg: (void)unregister_filesystem(&hugetlbfs_fs_type); out_free: kmem_cache_destroy(hugetlbfs_inode_cachep); out: return error; } fs_initcall(init_hugetlbfs_fs) |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 | // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/pagevec.h> #include <linux/task_io_accounting_ops.h> #include <linux/signal.h> #include <linux/iversion.h> #include <linux/ktime.h> #include <linux/netfs.h> #include "super.h" #include "mds_client.h" #include "cache.h" #include "metric.h" #include "crypto.h" #include <linux/ceph/osd_client.h> #include <linux/ceph/striper.h> /* * Ceph address space ops. * * There are a few funny things going on here. * * The page->private field is used to reference a struct * ceph_snap_context for _every_ dirty page. This indicates which * snapshot the page was logically dirtied in, and thus which snap * context needs to be associated with the osd write during writeback. * * Similarly, struct ceph_inode_info maintains a set of counters to * count dirty pages on the inode. In the absence of snapshots, * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. * * When a snapshot is taken (that is, when the client receives * notification that a snapshot was taken), each inode with caps and * with dirty pages (dirty pages implies there is a cap) gets a new * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending * order, new snaps go to the tail). The i_wrbuffer_ref_head count is * moved to capsnap->dirty. (Unless a sync write is currently in * progress. In that case, the capsnap is said to be "pending", new * writes cannot start, and the capsnap isn't "finalized" until the * write completes (or fails) and a final size/mtime for the inode for * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0. * * On writeback, we must submit writes to the osd IN SNAP ORDER. So, * we look for the first capsnap in i_cap_snaps and write out pages in * that snap context _only_. Then we move on to the next capsnap, * eventually reaching the "live" or "head" context (i.e., pages that * are not yet snapped) and are writing the most recently dirtied * pages. * * Invalidate and so forth must take care to ensure the dirty page * accounting is preserved. */ #define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10)) #define CONGESTION_OFF_THRESH(congestion_kb) \ (CONGESTION_ON_THRESH(congestion_kb) - \ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, struct folio **foliop, void **_fsdata); static inline struct ceph_snap_context *page_snap_context(struct page *page) { if (PagePrivate(page)) return (void *)page->private; return NULL; } /* * Dirty a page. Optimistically adjust accounting, on the assumption * that we won't race with invalidate. If we do, readjust. */ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) { struct inode *inode = mapping->host; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci; struct ceph_snap_context *snapc; if (folio_test_dirty(folio)) { doutc(cl, "%llx.%llx %p idx %lu -- already dirty\n", ceph_vinop(inode), folio, folio->index); VM_BUG_ON_FOLIO(!folio_test_private(folio), folio); return false; } ci = ceph_inode(inode); /* dirty the head */ spin_lock(&ci->i_ceph_lock); BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference if (__ceph_have_pending_cap_snap(ci)) { struct ceph_cap_snap *capsnap = list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item); snapc = ceph_get_snap_context(capsnap->context); capsnap->dirty_pages++; } else { BUG_ON(!ci->i_head_snapc); snapc = ceph_get_snap_context(ci->i_head_snapc); ++ci->i_wrbuffer_ref_head; } if (ci->i_wrbuffer_ref == 0) ihold(inode); ++ci->i_wrbuffer_ref; doutc(cl, "%llx.%llx %p idx %lu head %d/%d -> %d/%d " "snapc %p seq %lld (%d snaps)\n", ceph_vinop(inode), folio, folio->index, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, snapc, snapc->seq, snapc->num_snaps); spin_unlock(&ci->i_ceph_lock); /* * Reference snap context in folio->private. Also set * PagePrivate so that we get invalidate_folio callback. */ VM_WARN_ON_FOLIO(folio->private, folio); folio_attach_private(folio, snapc); return ceph_fscache_dirty_folio(mapping, folio); } /* * If we are truncating the full folio (i.e. offset == 0), adjust the * dirty folio counters appropriately. Only called if there is private * data on the folio. */ static void ceph_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct inode *inode = folio->mapping->host; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; if (offset != 0 || length != folio_size(folio)) { doutc(cl, "%llx.%llx idx %lu partial dirty page %zu~%zu\n", ceph_vinop(inode), folio->index, offset, length); return; } WARN_ON(!folio_test_locked(folio)); if (folio_test_private(folio)) { doutc(cl, "%llx.%llx idx %lu full dirty page\n", ceph_vinop(inode), folio->index); snapc = folio_detach_private(folio); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); } netfs_invalidate_folio(folio, offset, length); } static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) { struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_layout *lo = &ci->i_layout; unsigned long max_pages = inode->i_sb->s_bdi->ra_pages; loff_t end = rreq->start + rreq->len, new_end; struct ceph_netfs_request_data *priv = rreq->netfs_priv; unsigned long max_len; u32 blockoff; if (priv) { /* Readahead is disabled by posix_fadvise POSIX_FADV_RANDOM */ if (priv->file_ra_disabled) max_pages = 0; else max_pages = priv->file_ra_pages; } /* Readahead is disabled */ if (!max_pages) return; max_len = max_pages << PAGE_SHIFT; /* * Try to expand the length forward by rounding up it to the next * block, but do not exceed the file size, unless the original * request already exceeds it. */ new_end = min(round_up(end, lo->stripe_unit), rreq->i_size); if (new_end > end && new_end <= rreq->start + max_len) rreq->len = new_end - rreq->start; /* Try to expand the start downward */ div_u64_rem(rreq->start, lo->stripe_unit, &blockoff); if (rreq->len + blockoff <= max_len) { rreq->start -= blockoff; rreq->len += blockoff; } } static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) { struct inode *inode = subreq->rreq->inode; struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 objno, objoff; u32 xlen; /* Truncate the extent at the end of the current block */ ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len, &objno, &objoff, &xlen); subreq->len = min(xlen, fsc->mount_options->rsize); return true; } static void finish_netfs_read(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); struct netfs_io_subrequest *subreq = req->r_priv; struct ceph_osd_req_op *op = &req->r_ops[0]; int err = req->r_result; bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ); ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, osd_data->length, err); doutc(cl, "result %d subreq->len=%zu i_size=%lld\n", req->r_result, subreq->len, i_size_read(req->r_inode)); /* no object means success but no data */ if (err == -ENOENT) err = 0; else if (err == -EBLOCKLISTED) fsc->blocklisted = true; if (err >= 0) { if (sparse && err > 0) err = ceph_sparse_ext_map_end(op); if (err < subreq->len) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (IS_ENCRYPTED(inode) && err > 0) { err = ceph_fscrypt_decrypt_extents(inode, osd_data->pages, subreq->start, op->extent.sparse_ext, op->extent.sparse_ext_cnt); if (err > subreq->len) err = subreq->len; } } if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { ceph_put_page_vector(osd_data->pages, calc_pages_for(osd_data->alignment, osd_data->length), false); } netfs_subreq_terminated(subreq, err, false); iput(req->r_inode); ceph_dec_osd_stopping_blocker(fsc->mdsc); } static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; struct inode *inode = rreq->inode; struct ceph_mds_reply_info_parsed *rinfo; struct ceph_mds_reply_info_in *iinfo; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_info *ci = ceph_inode(inode); struct iov_iter iter; ssize_t err = 0; size_t len; int mode; __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); if (subreq->start >= inode->i_size) goto out; /* We need to fetch the inline data. */ mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } req->r_ino1 = ci->i_vino; req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) goto out; rinfo = &req->r_reply_info; iinfo = &rinfo->targeti; if (iinfo->inline_version == CEPH_INLINE_NONE) { /* The data got uninlined */ ceph_mdsc_put_request(req); return false; } len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); if (err == 0) err = -EFAULT; ceph_mdsc_put_request(req); out: netfs_subreq_terminated(subreq, err, false); return true; } static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct ceph_osd_request *req = NULL; struct ceph_vino vino = ceph_vino(inode); struct iov_iter iter; int err = 0; u64 len = subreq->len; bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); u64 off = subreq->start; int extent_cnt; if (ceph_inode_is_shutdown(inode)) { err = -EIO; goto out; } if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; ceph_fscrypt_adjust_off_and_len(inode, &off, &len); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { err = PTR_ERR(req); req = NULL; goto out; } if (sparse) { extent_cnt = __ceph_sparse_read_ext_count(inode, len); err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt); if (err) goto out; } doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n", ceph_vinop(inode), subreq->start, subreq->len, len); iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); /* * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for * encrypted inodes. We'd need infrastructure that handles an iov_iter * instead of page arrays, and we don't have that as of yet. Once the * dust settles on the write helpers and encrypt/decrypt routines for * netfs, we should be able to rework this. */ if (IS_ENCRYPTED(inode)) { struct page **pages; size_t page_off; err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); if (err < 0) { doutc(cl, "%llx.%llx failed to allocate pages, %d\n", ceph_vinop(inode), err); goto out; } /* should always give us a page-aligned read */ WARN_ON_ONCE(page_off); len = err; err = 0; osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); } else { osd_req_op_extent_osd_iter(req, 0, &iter); } if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { err = -EIO; goto out; } req->r_callback = finish_netfs_read; req->r_priv = subreq; req->r_inode = inode; ihold(inode); ceph_osdc_start_request(req->r_osdc, req); out: ceph_osdc_put_request(req); if (err) netfs_subreq_terminated(subreq, err, false); doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err); } static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) { struct inode *inode = rreq->inode; struct ceph_client *cl = ceph_inode_to_client(inode); int got = 0, want = CEPH_CAP_FILE_CACHE; struct ceph_netfs_request_data *priv; int ret = 0; if (rreq->origin != NETFS_READAHEAD) return 0; priv = kzalloc(sizeof(*priv), GFP_NOFS); if (!priv) return -ENOMEM; if (file) { struct ceph_rw_context *rw_ctx; struct ceph_file_info *fi = file->private_data; priv->file_ra_pages = file->f_ra.ra_pages; priv->file_ra_disabled = file->f_mode & FMODE_RANDOM; rw_ctx = ceph_find_rw_context(fi); if (rw_ctx) { rreq->netfs_priv = priv; return 0; } } /* * readahead callers do not necessarily hold Fcb caps * (e.g. fadvise, madvise). */ ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got); if (ret < 0) { doutc(cl, "%llx.%llx, error getting cap\n", ceph_vinop(inode)); goto out; } if (!(got & want)) { doutc(cl, "%llx.%llx, no cache cap\n", ceph_vinop(inode)); ret = -EACCES; goto out; } if (ret == 0) { ret = -EACCES; goto out; } priv->caps = got; rreq->netfs_priv = priv; out: if (ret < 0) kfree(priv); return ret; } static void ceph_netfs_free_request(struct netfs_io_request *rreq) { struct ceph_netfs_request_data *priv = rreq->netfs_priv; if (!priv) return; if (priv->caps) ceph_put_cap_refs(ceph_inode(rreq->inode), priv->caps); kfree(priv); rreq->netfs_priv = NULL; } const struct netfs_request_ops ceph_netfs_ops = { .init_request = ceph_init_request, .free_request = ceph_netfs_free_request, .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, .clamp_length = ceph_netfs_clamp_length, .check_write_begin = ceph_netfs_check_write_begin, }; #ifdef CONFIG_CEPH_FSCACHE static void ceph_set_page_fscache(struct page *page) { set_page_fscache(page); } static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) { struct inode *inode = priv; if (IS_ERR_VALUE(error) && error != -ENOBUFS) ceph_fscache_invalidate(inode, false); } static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) { struct ceph_inode_info *ci = ceph_inode(inode); struct fscache_cookie *cookie = ceph_fscache_cookie(ci); fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode), ceph_fscache_write_terminated, inode, caching); } #else static inline void ceph_set_page_fscache(struct page *page) { } static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) { } #endif /* CONFIG_CEPH_FSCACHE */ struct ceph_writeback_ctl { loff_t i_size; u64 truncate_size; u32 truncate_seq; bool size_stable; bool head_snapc; }; /* * Get ref for the oldest snapc for an inode with dirty data... that is, the * only snap context we are allowed to write back. */ static struct ceph_snap_context * get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl, struct ceph_snap_context *page_snapc) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_snap_context *snapc = NULL; struct ceph_cap_snap *capsnap = NULL; spin_lock(&ci->i_ceph_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { doutc(cl, " capsnap %p snapc %p has %d dirty pages\n", capsnap, capsnap->context, capsnap->dirty_pages); if (!capsnap->dirty_pages) continue; /* get i_size, truncate_{seq,size} for page_snapc? */ if (snapc && capsnap->context != page_snapc) continue; if (ctl) { if (capsnap->writing) { ctl->i_size = i_size_read(inode); ctl->size_stable = false; } else { ctl->i_size = capsnap->size; ctl->size_stable = true; } ctl->truncate_size = capsnap->truncate_size; ctl->truncate_seq = capsnap->truncate_seq; ctl->head_snapc = false; } if (snapc) break; snapc = ceph_get_snap_context(capsnap->context); if (!page_snapc || page_snapc == snapc || page_snapc->seq > snapc->seq) break; } if (!snapc && ci->i_wrbuffer_ref_head) { snapc = ceph_get_snap_context(ci->i_head_snapc); doutc(cl, " head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); if (ctl) { ctl->i_size = i_size_read(inode); ctl->truncate_size = ci->i_truncate_size; ctl->truncate_seq = ci->i_truncate_seq; ctl->size_stable = false; ctl->head_snapc = true; } } spin_unlock(&ci->i_ceph_lock); return snapc; } static u64 get_writepages_data_length(struct inode *inode, struct page *page, u64 start) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; struct ceph_cap_snap *capsnap = NULL; u64 end = i_size_read(inode); u64 ret; snapc = page_snap_context(ceph_fscrypt_pagecache_page(page)); if (snapc != ci->i_head_snapc) { bool found = false; spin_lock(&ci->i_ceph_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->context == snapc) { if (!capsnap->writing) end = capsnap->size; found = true; break; } } spin_unlock(&ci->i_ceph_lock); WARN_ON(!found); } if (end > ceph_fscrypt_page_offset(page) + thp_size(page)) end = ceph_fscrypt_page_offset(page) + thp_size(page); ret = end > start ? end - start : 0; if (ret && fscrypt_is_bounce_page(page)) ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE); return ret; } /* * Write a single page, but leave the page locked. * * If we get a write error, mark the mapping for error, but still adjust the * dirty page accounting (i.e., page is no longer dirty). */ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); int err; loff_t len = thp_size(page); loff_t wlen; struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; bool caching = ceph_is_cache_enabled(inode); struct page *bounce_page = NULL; doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page, page->index); if (ceph_inode_is_shutdown(inode)) return -EIO; /* verify this is a writeable snap context */ snapc = page_snap_context(page); if (!snapc) { doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode), page); return 0; } oldest = get_oldest_context(inode, &ceph_wbc, snapc); if (snapc->seq > oldest->seq) { doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n", ceph_vinop(inode), page, snapc); /* we should only noop if called by kswapd */ WARN_ON(!(current->flags & PF_MEMALLOC)); ceph_put_snap_context(oldest); redirty_page_for_writepage(wbc, page); return 0; } ceph_put_snap_context(oldest); /* is this a partial page at end of file? */ if (page_off >= ceph_wbc.i_size) { doutc(cl, "%llx.%llx folio at %lu beyond eof %llu\n", ceph_vinop(inode), folio->index, ceph_wbc.i_size); folio_invalidate(folio, 0, folio_size(folio)); return 0; } if (ceph_wbc.i_size < page_off + len) len = ceph_wbc.i_size - page_off; wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len; doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n", ceph_vinop(inode), page, page->index, page_off, wlen, snapc, snapc->seq); if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) fsc->write_congested = true; req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &wlen, 0, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); if (IS_ERR(req)) { redirty_page_for_writepage(wbc, page); return PTR_ERR(req); } if (wlen < len) len = wlen; set_page_writeback(page); if (caching) ceph_set_page_fscache(page); ceph_fscache_write_to_cache(inode, page_off, len, caching); if (IS_ENCRYPTED(inode)) { bounce_page = fscrypt_encrypt_pagecache_blocks(page, CEPH_FSCRYPT_BLOCK_SIZE, 0, GFP_NOFS); if (IS_ERR(bounce_page)) { redirty_page_for_writepage(wbc, page); end_page_writeback(page); ceph_osdc_put_request(req); return PTR_ERR(bounce_page); } } /* it may be a short write due to an object boundary */ WARN_ON_ONCE(len > thp_size(page)); osd_req_op_extent_osd_data_pages(req, 0, bounce_page ? &bounce_page : &page, wlen, 0, false, false); doutc(cl, "%llx.%llx %llu~%llu (%llu bytes, %sencrypted)\n", ceph_vinop(inode), page_off, len, wlen, IS_ENCRYPTED(inode) ? "" : "not "); req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(osdc, req); err = ceph_osdc_wait_request(osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); fscrypt_free_bounce_page(bounce_page); ceph_osdc_put_request(req); if (err == 0) err = len; if (err < 0) { struct writeback_control tmp_wbc; if (!wbc) wbc = &tmp_wbc; if (err == -ERESTARTSYS) { /* killed by SIGKILL */ doutc(cl, "%llx.%llx interrupted page %p\n", ceph_vinop(inode), page); redirty_page_for_writepage(wbc, page); end_page_writeback(page); return err; } if (err == -EBLOCKLISTED) fsc->blocklisted = true; doutc(cl, "%llx.%llx setting page/mapping error %d %p\n", ceph_vinop(inode), err, page); mapping_set_error(&inode->i_data, err); wbc->pages_skipped++; } else { doutc(cl, "%llx.%llx cleaned page %p\n", ceph_vinop(inode), page); err = 0; /* vfs expects us to return 0 */ } oldest = detach_page_private(page); WARN_ON_ONCE(oldest != snapc); end_page_writeback(page); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); /* page's reference */ if (atomic_long_dec_return(&fsc->writeback_count) < CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) fsc->write_congested = false; return err; } static int ceph_writepage(struct page *page, struct writeback_control *wbc) { int err; struct inode *inode = page->mapping->host; BUG_ON(!inode); ihold(inode); if (wbc->sync_mode == WB_SYNC_NONE && ceph_inode_to_fs_client(inode)->write_congested) return AOP_WRITEPAGE_ACTIVATE; wait_on_page_fscache(page); err = writepage_nounlock(page, wbc); if (err == -ERESTARTSYS) { /* direct memory reclaimer was killed by SIGKILL. return 0 * to prevent caller from setting mapping/page error */ err = 0; } unlock_page(page); iput(inode); return err; } /* * async writeback completion handler. * * If we get an error, set the mapping error bit, but not the individual * page error bits. */ static void writepages_finish(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_osd_data *osd_data; struct page *page; int num_pages, total_pages = 0; int i, j; int rc = req->r_result; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); unsigned int len = 0; bool remove_page; doutc(cl, "%llx.%llx rc %d\n", ceph_vinop(inode), rc); if (rc < 0) { mapping_set_error(mapping, rc); ceph_set_error_write(ci); if (rc == -EBLOCKLISTED) fsc->blocklisted = true; } else { ceph_clear_error_write(ci); } /* * We lost the cache cap, need to truncate the page before * it is unlocked, otherwise we'd truncate it later in the * page truncation thread, possibly losing some data that * raced its way in */ remove_page = !(ceph_caps_issued(ci) & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); /* clean all pages */ for (i = 0; i < req->r_num_ops; i++) { if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) { pr_warn_client(cl, "%llx.%llx incorrect op %d req %p index %d tid %llu\n", ceph_vinop(inode), req->r_ops[i].op, req, i, req->r_tid); break; } osd_data = osd_req_op_extent_osd_data(req, i); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); len += osd_data->length; num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); total_pages += num_pages; for (j = 0; j < num_pages; j++) { page = osd_data->pages[j]; if (fscrypt_is_bounce_page(page)) { page = fscrypt_pagecache_page(page); fscrypt_free_bounce_page(osd_data->pages[j]); osd_data->pages[j] = page; } BUG_ON(!page); WARN_ON(!PageUptodate(page)); if (atomic_long_dec_return(&fsc->writeback_count) < CONGESTION_OFF_THRESH( fsc->mount_options->congestion_kb)) fsc->write_congested = false; ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); doutc(cl, "unlocking %p\n", page); if (remove_page) generic_error_remove_folio(inode->i_mapping, page_folio(page)); unlock_page(page); } doutc(cl, "%llx.%llx wrote %llu bytes cleaned %d pages\n", ceph_vinop(inode), osd_data->length, rc >= 0 ? num_pages : 0); release_pages(osd_data->pages, num_pages); } ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, rc); ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); osd_data = osd_req_op_extent_osd_data(req, 0); if (osd_data->pages_from_pool) mempool_free(osd_data->pages, ceph_wb_pagevec_pool); else kfree(osd_data->pages); ceph_osdc_put_request(req); ceph_dec_osd_stopping_blocker(fsc->mdsc); } /* * initiate async writeback */ static int ceph_writepages_start(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct ceph_vino vino = ceph_vino(inode); pgoff_t index, start_index, end = -1; struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; struct folio_batch fbatch; int rc = 0; unsigned int wsize = i_blocksize(inode); struct ceph_osd_request *req = NULL; struct ceph_writeback_ctl ceph_wbc; bool should_loop, range_whole = false; bool done = false; bool caching = ceph_is_cache_enabled(inode); xa_mark_t tag; if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested) return 0; doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode), wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); if (ceph_inode_is_shutdown(inode)) { if (ci->i_wrbuffer_ref > 0) { pr_warn_ratelimited_client(cl, "%llx.%llx %lld forced umount\n", ceph_vinop(inode), ceph_ino(inode)); } mapping_set_error(mapping, -EIO); return -EIO; /* we're in a forced umount, don't write! */ } if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; folio_batch_init(&fbatch); start_index = wbc->range_cyclic ? mapping->writeback_index : 0; index = start_index; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { tag = PAGECACHE_TAG_TOWRITE; } else { tag = PAGECACHE_TAG_DIRTY; } retry: /* find oldest snap context with dirty data */ snapc = get_oldest_context(inode, &ceph_wbc, NULL); if (!snapc) { /* hmm, why does writepages get called when there is no dirty data? */ doutc(cl, " no snap context with dirty data?\n"); goto out; } doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); should_loop = false; if (ceph_wbc.head_snapc && snapc != last_snapc) { /* where to start/end? */ if (wbc->range_cyclic) { index = start_index; end = -1; if (index > 0) should_loop = true; doutc(cl, " cyclic, start at %lu\n", index); } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = true; doutc(cl, " not cyclic, %lu to %lu\n", index, end); } } else if (!ceph_wbc.head_snapc) { /* Do not respect wbc->range_{start,end}. Dirty pages * in that range can be associated with newer snapc. * They are not writeable until we write all dirty pages * associated with 'snapc' get written */ if (index > 0) should_loop = true; doutc(cl, " non-head snapc, range whole\n"); } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); ceph_put_snap_context(last_snapc); last_snapc = snapc; while (!done && index <= end) { int num_ops = 0, op_idx; unsigned i, nr_folios, max_pages, locked_pages = 0; struct page **pages = NULL, **data_pages; struct page *page; pgoff_t strip_unit_end = 0; u64 offset = 0, len = 0; bool from_pool = false; max_pages = wsize >> PAGE_SHIFT; get_more_pages: nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios); if (!nr_folios && !locked_pages) break; for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { page = &fbatch.folios[i]->page; doutc(cl, "? %p idx %lu\n", page, page->index); if (locked_pages == 0) lock_page(page); /* first page */ else if (!trylock_page(page)) break; /* only dirty pages, or our accounting breaks */ if (unlikely(!PageDirty(page)) || unlikely(page->mapping != mapping)) { doutc(cl, "!dirty or !mapping %p\n", page); unlock_page(page); continue; } /* only if matching snap context */ pgsnapc = page_snap_context(page); if (pgsnapc != snapc) { doutc(cl, "page snapc %p %lld != oldest %p %lld\n", pgsnapc, pgsnapc->seq, snapc, snapc->seq); if (!should_loop && !ceph_wbc.head_snapc && wbc->sync_mode != WB_SYNC_NONE) should_loop = true; unlock_page(page); continue; } if (page_offset(page) >= ceph_wbc.i_size) { struct folio *folio = page_folio(page); doutc(cl, "folio at %lu beyond eof %llu\n", folio->index, ceph_wbc.i_size); if ((ceph_wbc.size_stable || folio_pos(folio) >= i_size_read(inode)) && folio_clear_dirty_for_io(folio)) folio_invalidate(folio, 0, folio_size(folio)); folio_unlock(folio); continue; } if (strip_unit_end && (page->index > strip_unit_end)) { doutc(cl, "end of strip unit %p\n", page); unlock_page(page); break; } if (PageWriteback(page) || PageFsCache(page)) { if (wbc->sync_mode == WB_SYNC_NONE) { doutc(cl, "%p under writeback\n", page); unlock_page(page); continue; } doutc(cl, "waiting on writeback %p\n", page); wait_on_page_writeback(page); wait_on_page_fscache(page); } if (!clear_page_dirty_for_io(page)) { doutc(cl, "%p !clear_page_dirty_for_io\n", page); unlock_page(page); continue; } /* * We have something to write. If this is * the first locked page this time through, * calculate max possinle write size and * allocate a page array */ if (locked_pages == 0) { u64 objnum; u64 objoff; u32 xlen; /* prepare async write request */ offset = (u64)page_offset(page); ceph_calc_file_object_mapping(&ci->i_layout, offset, wsize, &objnum, &objoff, &xlen); len = xlen; num_ops = 1; strip_unit_end = page->index + ((len - 1) >> PAGE_SHIFT); BUG_ON(pages); max_pages = calc_pages_for(0, (u64)len); pages = kmalloc_array(max_pages, sizeof(*pages), GFP_NOFS); if (!pages) { from_pool = true; pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); BUG_ON(!pages); } len = 0; } else if (page->index != (offset + len) >> PAGE_SHIFT) { if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS)) { redirty_page_for_writepage(wbc, page); unlock_page(page); break; } num_ops++; offset = (u64)page_offset(page); len = 0; } /* note position of first page in fbatch */ doutc(cl, "%llx.%llx will write page %p idx %lu\n", ceph_vinop(inode), page, page->index); if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH( fsc->mount_options->congestion_kb)) fsc->write_congested = true; if (IS_ENCRYPTED(inode)) { pages[locked_pages] = fscrypt_encrypt_pagecache_blocks(page, PAGE_SIZE, 0, locked_pages ? GFP_NOWAIT : GFP_NOFS); if (IS_ERR(pages[locked_pages])) { if (PTR_ERR(pages[locked_pages]) == -EINVAL) pr_err_client(cl, "inode->i_blkbits=%hhu\n", inode->i_blkbits); /* better not fail on first page! */ BUG_ON(locked_pages == 0); pages[locked_pages] = NULL; redirty_page_for_writepage(wbc, page); unlock_page(page); break; } ++locked_pages; } else { pages[locked_pages++] = page; } fbatch.folios[i] = NULL; len += thp_size(page); } /* did we get anything? */ if (!locked_pages) goto release_folios; if (i) { unsigned j, n = 0; /* shift unused page to beginning of fbatch */ for (j = 0; j < nr_folios; j++) { if (!fbatch.folios[j]) continue; if (n < j) fbatch.folios[n] = fbatch.folios[j]; n++; } fbatch.nr = n; if (nr_folios && i == nr_folios && locked_pages < max_pages) { doutc(cl, "reached end fbatch, trying for more\n"); folio_batch_release(&fbatch); goto get_more_pages; } } new_request: offset = ceph_fscrypt_page_offset(pages[0]); len = wsize; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, offset, &len, 0, num_ops, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, false); if (IS_ERR(req)) { req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, offset, &len, 0, min(num_ops, CEPH_OSD_SLAB_OPS), CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); BUG_ON(IS_ERR(req)); } BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) + thp_size(pages[locked_pages - 1]) - offset); if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { rc = -EIO; goto release_folios; } req->r_callback = writepages_finish; req->r_inode = inode; /* Format the osd request message and submit the write */ len = 0; data_pages = pages; op_idx = 0; for (i = 0; i < locked_pages; i++) { struct page *page = ceph_fscrypt_pagecache_page(pages[i]); u64 cur_offset = page_offset(page); /* * Discontinuity in page range? Ceph can handle that by just passing * multiple extents in the write op. */ if (offset + len != cur_offset) { /* If it's full, stop here */ if (op_idx + 1 == req->r_num_ops) break; /* Kick off an fscache write with what we have so far. */ ceph_fscache_write_to_cache(inode, offset, len, caching); /* Start a new extent */ osd_req_op_extent_dup_last(req, op_idx, cur_offset - offset); doutc(cl, "got pages at %llu~%llu\n", offset, len); osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, 0, from_pool, false); osd_req_op_extent_update(req, op_idx, len); len = 0; offset = cur_offset; data_pages = pages + i; op_idx++; } set_page_writeback(page); if (caching) ceph_set_page_fscache(page); len += thp_size(page); } ceph_fscache_write_to_cache(inode, offset, len, caching); if (ceph_wbc.size_stable) { len = min(len, ceph_wbc.i_size - offset); } else if (i == locked_pages) { /* writepages_finish() clears writeback pages * according to the data length, so make sure * data length covers all locked pages */ u64 min_len = len + 1 - thp_size(page); len = get_writepages_data_length(inode, pages[i - 1], offset); len = max(len, min_len); } if (IS_ENCRYPTED(inode)) len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); doutc(cl, "got pages at %llu~%llu\n", offset, len); if (IS_ENCRYPTED(inode) && ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) pr_warn_client(cl, "bad encrypted write offset=%lld len=%llu\n", offset, len); osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, 0, from_pool, false); osd_req_op_extent_update(req, op_idx, len); BUG_ON(op_idx + 1 != req->r_num_ops); from_pool = false; if (i < locked_pages) { BUG_ON(num_ops <= req->r_num_ops); num_ops -= req->r_num_ops; locked_pages -= i; /* allocate new pages array for next request */ data_pages = pages; pages = kmalloc_array(locked_pages, sizeof(*pages), GFP_NOFS); if (!pages) { from_pool = true; pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); BUG_ON(!pages); } memcpy(pages, data_pages + i, locked_pages * sizeof(*pages)); memset(data_pages + i, 0, locked_pages * sizeof(*pages)); } else { BUG_ON(num_ops != req->r_num_ops); index = pages[i - 1]->index + 1; /* request message now owns the pages array */ pages = NULL; } req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(&fsc->client->osdc, req); req = NULL; wbc->nr_to_write -= i; if (pages) goto new_request; /* * We stop writing back only if we are not doing * integrity sync. In case of integrity sync we have to * keep going until we have written all the pages * we tagged for writeback prior to entering this loop. */ if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) done = true; release_folios: doutc(cl, "folio_batch release on %d folios (%p)\n", (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL); folio_batch_release(&fbatch); } if (should_loop && !done) { /* more to do; loop back to beginning of file */ doutc(cl, "looping back to beginning of file\n"); end = start_index - 1; /* OK even when start_index == 0 */ /* to write dirty pages associated with next snapc, * we need to wait until current writes complete */ if (wbc->sync_mode != WB_SYNC_NONE && start_index == 0 && /* all dirty pages were checked */ !ceph_wbc.head_snapc) { struct page *page; unsigned i, nr; index = 0; while ((index <= end) && (nr = filemap_get_folios_tag(mapping, &index, (pgoff_t)-1, PAGECACHE_TAG_WRITEBACK, &fbatch))) { for (i = 0; i < nr; i++) { page = &fbatch.folios[i]->page; if (page_snap_context(page) != snapc) continue; wait_on_page_writeback(page); } folio_batch_release(&fbatch); cond_resched(); } } start_index = 0; index = 0; goto retry; } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; out: ceph_osdc_put_request(req); ceph_put_snap_context(last_snapc); doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode), rc); return rc; } /* * See if a given @snapc is either writeable, or already written. */ static int context_is_writeable_or_written(struct inode *inode, struct ceph_snap_context *snapc) { struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL); int ret = !oldest || snapc->seq <= oldest->seq; ceph_put_snap_context(oldest); return ret; } /** * ceph_find_incompatible - find an incompatible context and return it * @page: page being dirtied * * We are only allowed to write into/dirty a page if the page is * clean, or already dirty within the same snap context. Returns a * conflicting context if there is one, NULL if there isn't, or a * negative error code on other errors. * * Must be called with page lock held. */ static struct ceph_snap_context * ceph_find_incompatible(struct page *page) { struct inode *inode = page->mapping->host; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); if (ceph_inode_is_shutdown(inode)) { doutc(cl, " %llx.%llx page %p is shutdown\n", ceph_vinop(inode), page); return ERR_PTR(-ESTALE); } for (;;) { struct ceph_snap_context *snapc, *oldest; wait_on_page_writeback(page); snapc = page_snap_context(page); if (!snapc || snapc == ci->i_head_snapc) break; /* * this page is already dirty in another (older) snap * context! is it writeable now? */ oldest = get_oldest_context(inode, NULL, NULL); if (snapc->seq > oldest->seq) { /* not writeable -- return it for the caller to deal with */ ceph_put_snap_context(oldest); doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n", ceph_vinop(inode), page, snapc); return ceph_get_snap_context(snapc); } ceph_put_snap_context(oldest); /* yay, writeable, do it now (without dropping page lock) */ doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n", ceph_vinop(inode), page, snapc); if (clear_page_dirty_for_io(page)) { int r = writepage_nounlock(page, NULL); if (r < 0) return ERR_PTR(r); } } return NULL; } static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, struct folio **foliop, void **_fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; snapc = ceph_find_incompatible(folio_page(*foliop, 0)); if (snapc) { int r; folio_unlock(*foliop); folio_put(*foliop); *foliop = NULL; if (IS_ERR(snapc)) return PTR_ERR(snapc); ceph_queue_writeback(inode); r = wait_event_killable(ci->i_cap_wq, context_is_writeable_or_written(inode, snapc)); ceph_put_snap_context(snapc); return r == 0 ? -EAGAIN : r; } return 0; } /* * We are only allowed to write into/dirty the page if the page is * clean, or already dirty within the same snap context. */ static int ceph_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct folio *folio = NULL; int r; r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL); if (r < 0) return r; folio_wait_fscache(folio); WARN_ON_ONCE(!folio_test_locked(folio)); *pagep = &folio->page; return 0; } /* * we don't do anything in here that simple_write_end doesn't do * except adjust dirty page accounting */ static int ceph_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *subpage, void *fsdata) { struct folio *folio = page_folio(subpage); struct inode *inode = file_inode(file); struct ceph_client *cl = ceph_inode_to_client(inode); bool check_cap = false; doutc(cl, "%llx.%llx file %p folio %p %d~%d (%d)\n", ceph_vinop(inode), file, folio, (int)pos, (int)copied, (int)len); if (!folio_test_uptodate(folio)) { /* just return that nothing was copied on a short copy */ if (copied < len) { copied = 0; goto out; } folio_mark_uptodate(folio); } /* did file size increase? */ if (pos+copied > i_size_read(inode)) check_cap = ceph_inode_set_size(inode, pos+copied); folio_mark_dirty(folio); out: folio_unlock(folio); folio_put(folio); if (check_cap) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY); return copied; } const struct address_space_operations ceph_aops = { .read_folio = netfs_read_folio, .readahead = netfs_readahead, .writepage = ceph_writepage, .writepages = ceph_writepages_start, .write_begin = ceph_write_begin, .write_end = ceph_write_end, .dirty_folio = ceph_dirty_folio, .invalidate_folio = ceph_invalidate_folio, .release_folio = netfs_release_folio, .direct_IO = noop_direct_IO, }; static void ceph_block_sigs(sigset_t *oldset) { sigset_t mask; siginitsetinv(&mask, sigmask(SIGKILL)); sigprocmask(SIG_BLOCK, &mask, oldset); } static void ceph_restore_sigs(sigset_t *oldset) { sigprocmask(SIG_SETMASK, oldset, NULL); } /* * vm ops */ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_file_info *fi = vma->vm_file->private_data; loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT; int want, got, err; sigset_t oldset; vm_fault_t ret = VM_FAULT_SIGBUS; if (ceph_inode_is_shutdown(inode)) return ret; ceph_block_sigs(&oldset); doutc(cl, "%llx.%llx %llu trying to get caps\n", ceph_vinop(inode), off); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; got = 0; err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got); if (err < 0) goto out_restore; doutc(cl, "%llx.%llx %llu got cap refs on %s\n", ceph_vinop(inode), off, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || !ceph_has_inline_data(ci)) { CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); ceph_add_rw_context(fi, &rw_ctx); ret = filemap_fault(vmf); ceph_del_rw_context(fi, &rw_ctx); doutc(cl, "%llx.%llx %llu drop cap refs %s ret %x\n", ceph_vinop(inode), off, ceph_cap_string(got), ret); } else err = -EAGAIN; ceph_put_cap_refs(ci, got); if (err != -EAGAIN) goto out_restore; /* read inline data */ if (off >= PAGE_SIZE) { /* does not support inline data > PAGE_SIZE */ ret = VM_FAULT_SIGBUS; } else { struct address_space *mapping = inode->i_mapping; struct page *page; filemap_invalidate_lock_shared(mapping); page = find_or_create_page(mapping, 0, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; goto out_inline; } err = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, true); if (err < 0 || off >= i_size_read(inode)) { unlock_page(page); put_page(page); ret = vmf_error(err); goto out_inline; } if (err < PAGE_SIZE) zero_user_segment(page, err, PAGE_SIZE); else flush_dcache_page(page); SetPageUptodate(page); vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; out_inline: filemap_invalidate_unlock_shared(mapping); doutc(cl, "%llx.%llx %llu read inline data ret %x\n", ceph_vinop(inode), off, ret); } out_restore: ceph_restore_sigs(&oldset); if (err < 0) ret = vmf_error(err); return ret; } static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; struct ceph_cap_flush *prealloc_cf; struct page *page = vmf->page; loff_t off = page_offset(page); loff_t size = i_size_read(inode); size_t len; int want, got, err; sigset_t oldset; vm_fault_t ret = VM_FAULT_SIGBUS; if (ceph_inode_is_shutdown(inode)) return ret; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return VM_FAULT_OOM; sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); if (off + thp_size(page) <= size) len = thp_size(page); else len = offset_in_thp(page, size); doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n", ceph_vinop(inode), off, len, size); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_BUFFER; got = 0; err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got); if (err < 0) goto out_free; doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode), off, len, ceph_cap_string(got)); /* Update time before taking page lock */ file_update_time(vma->vm_file); inode_inc_iversion_raw(inode); do { struct ceph_snap_context *snapc; lock_page(page); if (page_mkwrite_check_truncate(page, inode) < 0) { unlock_page(page); ret = VM_FAULT_NOPAGE; break; } snapc = ceph_find_incompatible(page); if (!snapc) { /* success. we'll keep the page locked. */ set_page_dirty(page); ret = VM_FAULT_LOCKED; break; } unlock_page(page); if (IS_ERR(snapc)) { ret = VM_FAULT_SIGBUS; break; } ceph_queue_writeback(inode); err = wait_event_killable(ci->i_cap_wq, context_is_writeable_or_written(inode, snapc)); ceph_put_snap_context(snapc); } while (err == 0); if (ret == VM_FAULT_LOCKED) { int dirty; spin_lock(&ci->i_ceph_lock); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); } doutc(cl, "%llx.%llx %llu~%zd dropping cap refs on %s ret %x\n", ceph_vinop(inode), off, len, ceph_cap_string(got), ret); ceph_put_cap_refs_async(ci, got); out_free: ceph_restore_sigs(&oldset); sb_end_pagefault(inode->i_sb); ceph_free_cap_flush(prealloc_cf); if (err < 0) ret = vmf_error(err); return ret; } void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len) { struct ceph_client *cl = ceph_inode_to_client(inode); struct address_space *mapping = inode->i_mapping; struct page *page; if (locked_page) { page = locked_page; } else { if (i_size_read(inode) == 0) return; page = find_or_create_page(mapping, 0, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (!page) return; if (PageUptodate(page)) { unlock_page(page); put_page(page); return; } } doutc(cl, "%p %llx.%llx len %zu locked_page %p\n", inode, ceph_vinop(inode), len, locked_page); if (len > 0) { void *kaddr = kmap_atomic(page); memcpy(kaddr, data, len); kunmap_atomic(kaddr); } if (page != locked_page) { if (len < PAGE_SIZE) zero_user_segment(page, len, PAGE_SIZE); else flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); put_page(page); } } int ceph_uninline_data(struct file *file) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct ceph_osd_request *req = NULL; struct ceph_cap_flush *prealloc_cf = NULL; struct folio *folio = NULL; u64 inline_version = CEPH_INLINE_NONE; struct page *pages[1]; int err = 0; u64 len; spin_lock(&ci->i_ceph_lock); inline_version = ci->i_inline_version; spin_unlock(&ci->i_ceph_lock); doutc(cl, "%llx.%llx inline_version %llu\n", ceph_vinop(inode), inline_version); if (ceph_inode_is_shutdown(inode)) { err = -EIO; goto out; } if (inline_version == CEPH_INLINE_NONE) return 0; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; if (inline_version == 1) /* initial version, no data */ goto out_uninline; folio = read_mapping_folio(inode->i_mapping, 0, file); if (IS_ERR(folio)) { err = PTR_ERR(folio); goto out; } folio_lock(folio); len = i_size_read(inode); if (len > folio_size(folio)) len = folio_size(folio); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 0, 1, CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE, NULL, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); goto out_unlock; } req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(&fsc->client->osdc, req); err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_osdc_put_request(req); if (err < 0) goto out_unlock; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 1, 3, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { err = PTR_ERR(req); goto out_unlock; } pages[0] = folio_page(folio, 0); osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false); { __le64 xattr_buf = cpu_to_le64(inline_version); err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, "inline_version", &xattr_buf, sizeof(xattr_buf), CEPH_OSD_CMPXATTR_OP_GT, CEPH_OSD_CMPXATTR_MODE_U64); if (err) goto out_put_req; } { char xattr_buf[32]; int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf), "%llu", inline_version); err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, "inline_version", xattr_buf, xattr_len, 0, 0); if (err) goto out_put_req; } req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(&fsc->client->osdc, req); err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); out_uninline: if (!err) { int dirty; /* Set to CAP_INLINE_NONE and dirty the caps */ down_read(&fsc->mdsc->snap_rwsem); spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); up_read(&fsc->mdsc->snap_rwsem); if (dirty) __mark_inode_dirty(inode, dirty); } out_put_req: ceph_osdc_put_request(req); if (err == -ECANCELED) err = 0; out_unlock: if (folio) { folio_unlock(folio); folio_put(folio); } out: ceph_free_cap_flush(prealloc_cf); doutc(cl, "%llx.%llx inline_version %llu = %d\n", ceph_vinop(inode), inline_version, err); return err; } static const struct vm_operations_struct ceph_vmops = { .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, }; int ceph_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; vma->vm_ops = &ceph_vmops; return 0; } enum { POOL_READ = 1, POOL_WRITE = 2, }; static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool, struct ceph_string *pool_ns) { struct ceph_fs_client *fsc = ceph_inode_to_fs_client(&ci->netfs.inode); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_client *cl = fsc->client; struct ceph_osd_request *rd_req = NULL, *wr_req = NULL; struct rb_node **p, *parent; struct ceph_pool_perm *perm; struct page **pages; size_t pool_ns_len; int err = 0, err2 = 0, have = 0; down_read(&mdsc->pool_perm_rwsem); p = &mdsc->pool_perm_tree.rb_node; while (*p) { perm = rb_entry(*p, struct ceph_pool_perm, node); if (pool < perm->pool) p = &(*p)->rb_left; else if (pool > perm->pool) p = &(*p)->rb_right; else { int ret = ceph_compare_string(pool_ns, perm->pool_ns, perm->pool_ns_len); if (ret < 0) p = &(*p)->rb_left; else if (ret > 0) p = &(*p)->rb_right; else { have = perm->perm; break; } } } up_read(&mdsc->pool_perm_rwsem); if (*p) goto out; if (pool_ns) doutc(cl, "pool %lld ns %.*s no perm cached\n", pool, (int)pool_ns->len, pool_ns->str); else doutc(cl, "pool %lld no perm cached\n", pool); down_write(&mdsc->pool_perm_rwsem); p = &mdsc->pool_perm_tree.rb_node; parent = NULL; while (*p) { parent = *p; perm = rb_entry(parent, struct ceph_pool_perm, node); if (pool < perm->pool) p = &(*p)->rb_left; else if (pool > perm->pool) p = &(*p)->rb_right; else { int ret = ceph_compare_string(pool_ns, perm->pool_ns, perm->pool_ns_len); if (ret < 0) p = &(*p)->rb_left; else if (ret > 0) p = &(*p)->rb_right; else { have = perm->perm; break; } } } if (*p) { up_write(&mdsc->pool_perm_rwsem); goto out; } rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1, false, GFP_NOFS); if (!rd_req) { err = -ENOMEM; goto out_unlock; } rd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); rd_req->r_base_oloc.pool = pool; if (pool_ns) rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns); ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino); err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); if (err) goto out_unlock; wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1, false, GFP_NOFS); if (!wr_req) { err = -ENOMEM; goto out_unlock; } wr_req->r_flags = CEPH_OSD_FLAG_WRITE; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); if (err) goto out_unlock; /* one page should be large enough for STAT data */ pages = ceph_alloc_page_vector(1, GFP_KERNEL); if (IS_ERR(pages)) { err = PTR_ERR(pages); goto out_unlock; } osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, 0, false, true); ceph_osdc_start_request(&fsc->client->osdc, rd_req); wr_req->r_mtime = inode_get_mtime(&ci->netfs.inode); ceph_osdc_start_request(&fsc->client->osdc, wr_req); err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); if (err >= 0 || err == -ENOENT) have |= POOL_READ; else if (err != -EPERM) { if (err == -EBLOCKLISTED) fsc->blocklisted = true; goto out_unlock; } if (err2 == 0 || err2 == -EEXIST) have |= POOL_WRITE; else if (err2 != -EPERM) { if (err2 == -EBLOCKLISTED) fsc->blocklisted = true; err = err2; goto out_unlock; } pool_ns_len = pool_ns ? pool_ns->len : 0; perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS); if (!perm) { err = -ENOMEM; goto out_unlock; } perm->pool = pool; perm->perm = have; perm->pool_ns_len = pool_ns_len; if (pool_ns_len > 0) memcpy(perm->pool_ns, pool_ns->str, pool_ns_len); perm->pool_ns[pool_ns_len] = 0; rb_link_node(&perm->node, parent, p); rb_insert_color(&perm->node, &mdsc->pool_perm_tree); err = 0; out_unlock: up_write(&mdsc->pool_perm_rwsem); ceph_osdc_put_request(rd_req); ceph_osdc_put_request(wr_req); out: if (!err) err = have; if (pool_ns) doutc(cl, "pool %lld ns %.*s result = %d\n", pool, (int)pool_ns->len, pool_ns->str, err); else doutc(cl, "pool %lld result = %d\n", pool, err); return err; } int ceph_pool_perm_check(struct inode *inode, int need) { struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_string *pool_ns; s64 pool; int ret, flags; /* Only need to do this for regular files */ if (!S_ISREG(inode->i_mode)) return 0; if (ci->i_vino.snap != CEPH_NOSNAP) { /* * Pool permission check needs to write to the first object. * But for snapshot, head of the first object may have alread * been deleted. Skip check to avoid creating orphan object. */ return 0; } if (ceph_test_mount_opt(ceph_inode_to_fs_client(inode), NOPOOLPERM)) return 0; spin_lock(&ci->i_ceph_lock); flags = ci->i_ceph_flags; pool = ci->i_layout.pool_id; spin_unlock(&ci->i_ceph_lock); check: if (flags & CEPH_I_POOL_PERM) { if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) { doutc(cl, "pool %lld no read perm\n", pool); return -EPERM; } if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) { doutc(cl, "pool %lld no write perm\n", pool); return -EPERM; } return 0; } pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); ret = __ceph_pool_perm_get(ci, pool, pool_ns); ceph_put_string(pool_ns); if (ret < 0) return ret; flags = CEPH_I_POOL_PERM; if (ret & POOL_READ) flags |= CEPH_I_POOL_RD; if (ret & POOL_WRITE) flags |= CEPH_I_POOL_WR; spin_lock(&ci->i_ceph_lock); if (pool == ci->i_layout.pool_id && pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) { ci->i_ceph_flags |= flags; } else { pool = ci->i_layout.pool_id; flags = ci->i_ceph_flags; } spin_unlock(&ci->i_ceph_lock); goto check; } void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc) { struct ceph_pool_perm *perm; struct rb_node *n; while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) { n = rb_first(&mdsc->pool_perm_tree); perm = rb_entry(n, struct ceph_pool_perm, node); rb_erase(n, &mdsc->pool_perm_tree); kfree(perm); } } |
3 3 12 1 7 4 1 4 1 3 8 1 9 2 81 10 50 1 1 1 1 1 1 1 1 1 1 1 1 1 3 6 7 7 4 11 8 8 1 6 7 2 2 2 1 3 1 4 54 1 6 46 1 1 1 1 1 1 1 1 12 12 10 13 1 1 1 10 10 1 9 10 11 11 5 3 6 2 2 7 1 1 1 1 1 12 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 10 10 10 1 1 20 20 32 23 10 3 3 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 | // SPDX-License-Identifier: GPL-2.0 /* * Wireless utility functions * * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH * Copyright (C) 2018-2023 Intel Corporation */ #include <linux/export.h> #include <linux/bitops.h> #include <linux/etherdevice.h> #include <linux/slab.h> #include <linux/ieee80211.h> #include <net/cfg80211.h> #include <net/ip.h> #include <net/dsfield.h> #include <linux/if_vlan.h> #include <linux/mpls.h> #include <linux/gcd.h> #include <linux/bitfield.h> #include <linux/nospec.h> #include "core.h" #include "rdev-ops.h" const struct ieee80211_rate * ieee80211_get_response_rate(struct ieee80211_supported_band *sband, u32 basic_rates, int bitrate) { struct ieee80211_rate *result = &sband->bitrates[0]; int i; for (i = 0; i < sband->n_bitrates; i++) { if (!(basic_rates & BIT(i))) continue; if (sband->bitrates[i].bitrate > bitrate) continue; result = &sband->bitrates[i]; } return result; } EXPORT_SYMBOL(ieee80211_get_response_rate); u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband) { struct ieee80211_rate *bitrates; u32 mandatory_rates = 0; enum ieee80211_rate_flags mandatory_flag; int i; if (WARN_ON(!sband)) return 1; if (sband->band == NL80211_BAND_2GHZ) mandatory_flag = IEEE80211_RATE_MANDATORY_B; else mandatory_flag = IEEE80211_RATE_MANDATORY_A; bitrates = sband->bitrates; for (i = 0; i < sband->n_bitrates; i++) if (bitrates[i].flags & mandatory_flag) mandatory_rates |= BIT(i); return mandatory_rates; } EXPORT_SYMBOL(ieee80211_mandatory_rates); u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) { /* see 802.11 17.3.8.3.2 and Annex J * there are overlapping channel numbers in 5GHz and 2GHz bands */ if (chan <= 0) return 0; /* not supported */ switch (band) { case NL80211_BAND_2GHZ: case NL80211_BAND_LC: if (chan == 14) return MHZ_TO_KHZ(2484); else if (chan < 14) return MHZ_TO_KHZ(2407 + chan * 5); break; case NL80211_BAND_5GHZ: if (chan >= 182 && chan <= 196) return MHZ_TO_KHZ(4000 + chan * 5); else return MHZ_TO_KHZ(5000 + chan * 5); break; case NL80211_BAND_6GHZ: /* see 802.11ax D6.1 27.3.23.2 */ if (chan == 2) return MHZ_TO_KHZ(5935); if (chan <= 233) return MHZ_TO_KHZ(5950 + chan * 5); break; case NL80211_BAND_60GHZ: if (chan < 7) return MHZ_TO_KHZ(56160 + chan * 2160); break; case NL80211_BAND_S1GHZ: return 902000 + chan * 500; default: ; } return 0; /* not supported */ } EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); enum nl80211_chan_width ieee80211_s1g_channel_width(const struct ieee80211_channel *chan) { if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ)) return NL80211_CHAN_WIDTH_20_NOHT; /*S1G defines a single allowed channel width per channel. * Extract that width here. */ if (chan->flags & IEEE80211_CHAN_1MHZ) return NL80211_CHAN_WIDTH_1; else if (chan->flags & IEEE80211_CHAN_2MHZ) return NL80211_CHAN_WIDTH_2; else if (chan->flags & IEEE80211_CHAN_4MHZ) return NL80211_CHAN_WIDTH_4; else if (chan->flags & IEEE80211_CHAN_8MHZ) return NL80211_CHAN_WIDTH_8; else if (chan->flags & IEEE80211_CHAN_16MHZ) return NL80211_CHAN_WIDTH_16; pr_err("unknown channel width for channel at %dKHz?\n", ieee80211_channel_to_khz(chan)); return NL80211_CHAN_WIDTH_1; } EXPORT_SYMBOL(ieee80211_s1g_channel_width); int ieee80211_freq_khz_to_channel(u32 freq) { /* TODO: just handle MHz for now */ freq = KHZ_TO_MHZ(freq); /* see 802.11 17.3.8.3.2 and Annex J */ if (freq == 2484) return 14; else if (freq < 2484) return (freq - 2407) / 5; else if (freq >= 4910 && freq <= 4980) return (freq - 4000) / 5; else if (freq < 5925) return (freq - 5000) / 5; else if (freq == 5935) return 2; else if (freq <= 45000) /* DMG band lower limit */ /* see 802.11ax D6.1 27.3.22.2 */ return (freq - 5950) / 5; else if (freq >= 58320 && freq <= 70200) return (freq - 56160) / 2160; else return 0; } EXPORT_SYMBOL(ieee80211_freq_khz_to_channel); struct ieee80211_channel *ieee80211_get_channel_khz(struct wiphy *wiphy, u32 freq) { enum nl80211_band band; struct ieee80211_supported_band *sband; int i; for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { struct ieee80211_channel *chan = &sband->channels[i]; if (ieee80211_channel_to_khz(chan) == freq) return chan; } } return NULL; } EXPORT_SYMBOL(ieee80211_get_channel_khz); static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) { int i, want; switch (sband->band) { case NL80211_BAND_5GHZ: case NL80211_BAND_6GHZ: want = 3; for (i = 0; i < sband->n_bitrates; i++) { if (sband->bitrates[i].bitrate == 60 || sband->bitrates[i].bitrate == 120 || sband->bitrates[i].bitrate == 240) { sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_A; want--; } } WARN_ON(want); break; case NL80211_BAND_2GHZ: case NL80211_BAND_LC: want = 7; for (i = 0; i < sband->n_bitrates; i++) { switch (sband->bitrates[i].bitrate) { case 10: case 20: case 55: case 110: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_B | IEEE80211_RATE_MANDATORY_G; want--; break; case 60: case 120: case 240: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_G; want--; fallthrough; default: sband->bitrates[i].flags |= IEEE80211_RATE_ERP_G; break; } } WARN_ON(want != 0 && want != 3); break; case NL80211_BAND_60GHZ: /* check for mandatory HT MCS 1..4 */ WARN_ON(!sband->ht_cap.ht_supported); WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; case NL80211_BAND_S1GHZ: /* Figure 9-589bd: 3 means unsupported, so != 3 means at least * mandatory is ok. */ WARN_ON((sband->s1g_cap.nss_mcs[0] & 0x3) == 0x3); break; case NUM_NL80211_BANDS: default: WARN_ON(1); break; } } void ieee80211_set_bitrate_flags(struct wiphy *wiphy) { enum nl80211_band band; for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) set_mandatory_flags_band(wiphy->bands[band]); } bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher) { int i; for (i = 0; i < wiphy->n_cipher_suites; i++) if (cipher == wiphy->cipher_suites[i]) return true; return false; } static bool cfg80211_igtk_cipher_supported(struct cfg80211_registered_device *rdev) { struct wiphy *wiphy = &rdev->wiphy; int i; for (i = 0; i < wiphy->n_cipher_suites; i++) { switch (wiphy->cipher_suites[i]) { case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: return true; } } return false; } bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev, int key_idx, bool pairwise) { int max_key_idx; if (pairwise) max_key_idx = 3; else if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION) || wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; else if (cfg80211_igtk_cipher_supported(rdev)) max_key_idx = 5; else max_key_idx = 3; if (key_idx < 0 || key_idx > max_key_idx) return false; return true; } int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr) { if (!cfg80211_valid_key_idx(rdev, key_idx, pairwise)) return -EINVAL; if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) return -EINVAL; if (pairwise && !mac_addr) return -EINVAL; switch (params->cipher) { case WLAN_CIPHER_SUITE_TKIP: /* Extended Key ID can only be used with CCMP/GCMP ciphers */ if ((pairwise && key_idx) || params->mode != NL80211_KEY_RX_TX) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: /* IEEE802.11-2016 allows only 0 and - when supporting * Extended Key ID - 1 as index for pairwise keys. * @NL80211_KEY_NO_TX is only allowed for pairwise keys when * the driver supports Extended Key ID. * @NL80211_KEY_SET_TX can't be set when installing and * validating a key. */ if ((params->mode == NL80211_KEY_NO_TX && !pairwise) || params->mode == NL80211_KEY_SET_TX) return -EINVAL; if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_EXT_KEY_ID)) { if (pairwise && (key_idx < 0 || key_idx > 1)) return -EINVAL; } else if (pairwise && key_idx) { return -EINVAL; } break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: /* Disallow BIP (group-only) cipher as pairwise cipher */ if (pairwise) return -EINVAL; if (key_idx < 4) return -EINVAL; break; case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: if (key_idx > 3) return -EINVAL; break; default: break; } switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: if (params->key_len != WLAN_KEY_LEN_WEP40) return -EINVAL; break; case WLAN_CIPHER_SUITE_TKIP: if (params->key_len != WLAN_KEY_LEN_TKIP) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP: if (params->key_len != WLAN_KEY_LEN_CCMP) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP_256: if (params->key_len != WLAN_KEY_LEN_CCMP_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_GCMP: if (params->key_len != WLAN_KEY_LEN_GCMP) return -EINVAL; break; case WLAN_CIPHER_SUITE_GCMP_256: if (params->key_len != WLAN_KEY_LEN_GCMP_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_WEP104: if (params->key_len != WLAN_KEY_LEN_WEP104) return -EINVAL; break; case WLAN_CIPHER_SUITE_AES_CMAC: if (params->key_len != WLAN_KEY_LEN_AES_CMAC) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (params->key_len != WLAN_KEY_LEN_BIP_CMAC_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_128) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_256) return -EINVAL; break; default: /* * We don't know anything about this algorithm, * allow using it -- but the driver must check * all parameters! We still check below whether * or not the driver supports this algorithm, * of course. */ break; } if (params->seq) { switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: /* These ciphers do not use key sequence */ return -EINVAL; case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (params->seq_len != 6) return -EINVAL; break; } } if (!cfg80211_supported_cipher_suite(&rdev->wiphy, params->cipher)) return -EINVAL; return 0; } unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc) { unsigned int hdrlen = 24; if (ieee80211_is_ext(fc)) { hdrlen = 4; goto out; } if (ieee80211_is_data(fc)) { if (ieee80211_has_a4(fc)) hdrlen = 30; if (ieee80211_is_data_qos(fc)) { hdrlen += IEEE80211_QOS_CTL_LEN; if (ieee80211_has_order(fc)) hdrlen += IEEE80211_HT_CTL_LEN; } goto out; } if (ieee80211_is_mgmt(fc)) { if (ieee80211_has_order(fc)) hdrlen += IEEE80211_HT_CTL_LEN; goto out; } if (ieee80211_is_ctl(fc)) { /* * ACK and CTS are 10 bytes, all others 16. To see how * to get this condition consider * subtype mask: 0b0000000011110000 (0x00F0) * ACK subtype: 0b0000000011010000 (0x00D0) * CTS subtype: 0b0000000011000000 (0x00C0) * bits that matter: ^^^ (0x00E0) * value of those: 0b0000000011000000 (0x00C0) */ if ((fc & cpu_to_le16(0x00E0)) == cpu_to_le16(0x00C0)) hdrlen = 10; else hdrlen = 16; } out: return hdrlen; } EXPORT_SYMBOL(ieee80211_hdrlen); unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb) { const struct ieee80211_hdr *hdr = (const struct ieee80211_hdr *)skb->data; unsigned int hdrlen; if (unlikely(skb->len < 10)) return 0; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (unlikely(hdrlen > skb->len)) return 0; return hdrlen; } EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb); static unsigned int __ieee80211_get_mesh_hdrlen(u8 flags) { int ae = flags & MESH_FLAGS_AE; /* 802.11-2012, 8.2.4.7.3 */ switch (ae) { default: case 0: return 6; case MESH_FLAGS_AE_A4: return 12; case MESH_FLAGS_AE_A5_A6: return 18; } } unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) { return __ieee80211_get_mesh_hdrlen(meshhdr->flags); } EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); bool ieee80211_get_8023_tunnel_proto(const void *hdr, __be16 *proto) { const __be16 *hdr_proto = hdr + ETH_ALEN; if (!(ether_addr_equal(hdr, rfc1042_header) && *hdr_proto != htons(ETH_P_AARP) && *hdr_proto != htons(ETH_P_IPX)) && !ether_addr_equal(hdr, bridge_tunnel_header)) return false; *proto = *hdr_proto; return true; } EXPORT_SYMBOL(ieee80211_get_8023_tunnel_proto); int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb) { const void *mesh_addr; struct { struct ethhdr eth; u8 flags; } payload; int hdrlen; int ret; ret = skb_copy_bits(skb, 0, &payload, sizeof(payload)); if (ret) return ret; hdrlen = sizeof(payload.eth) + __ieee80211_get_mesh_hdrlen(payload.flags); if (likely(pskb_may_pull(skb, hdrlen + 8) && ieee80211_get_8023_tunnel_proto(skb->data + hdrlen, &payload.eth.h_proto))) hdrlen += ETH_ALEN + 2; else if (!pskb_may_pull(skb, hdrlen)) return -EINVAL; else payload.eth.h_proto = htons(skb->len - hdrlen); mesh_addr = skb->data + sizeof(payload.eth) + ETH_ALEN; switch (payload.flags & MESH_FLAGS_AE) { case MESH_FLAGS_AE_A4: memcpy(&payload.eth.h_source, mesh_addr, ETH_ALEN); break; case MESH_FLAGS_AE_A5_A6: memcpy(&payload.eth, mesh_addr, 2 * ETH_ALEN); break; default: break; } pskb_pull(skb, hdrlen - sizeof(payload.eth)); memcpy(skb->data, &payload.eth, sizeof(payload.eth)); return 0; } EXPORT_SYMBOL(ieee80211_strip_8023_mesh_hdr); int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, const u8 *addr, enum nl80211_iftype iftype, u8 data_offset, bool is_amsdu) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct { u8 hdr[ETH_ALEN] __aligned(2); __be16 proto; } payload; struct ethhdr tmp; u16 hdrlen; if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return -1; hdrlen = ieee80211_hdrlen(hdr->frame_control) + data_offset; if (skb->len < hdrlen) return -1; /* convert IEEE 802.11 header + possible LLC headers into Ethernet * header * IEEE 802.11 address fields: * ToDS FromDS Addr1 Addr2 Addr3 Addr4 * 0 0 DA SA BSSID n/a * 0 1 DA BSSID SA n/a * 1 0 BSSID SA DA n/a * 1 1 RA TA DA SA */ memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN); memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN); switch (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { case cpu_to_le16(IEEE80211_FCTL_TODS): if (unlikely(iftype != NL80211_IFTYPE_AP && iftype != NL80211_IFTYPE_AP_VLAN && iftype != NL80211_IFTYPE_P2P_GO)) return -1; break; case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): if (unlikely(iftype != NL80211_IFTYPE_MESH_POINT && iftype != NL80211_IFTYPE_AP_VLAN && iftype != NL80211_IFTYPE_STATION)) return -1; break; case cpu_to_le16(IEEE80211_FCTL_FROMDS): if ((iftype != NL80211_IFTYPE_STATION && iftype != NL80211_IFTYPE_P2P_CLIENT && iftype != NL80211_IFTYPE_MESH_POINT) || (is_multicast_ether_addr(tmp.h_dest) && ether_addr_equal(tmp.h_source, addr))) return -1; break; case cpu_to_le16(0): if (iftype != NL80211_IFTYPE_ADHOC && iftype != NL80211_IFTYPE_STATION && iftype != NL80211_IFTYPE_OCB) return -1; break; } if (likely(!is_amsdu && iftype != NL80211_IFTYPE_MESH_POINT && skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)) == 0 && ieee80211_get_8023_tunnel_proto(&payload, &tmp.h_proto))) { /* remove RFC1042 or Bridge-Tunnel encapsulation */ hdrlen += ETH_ALEN + 2; skb_postpull_rcsum(skb, &payload, ETH_ALEN + 2); } else { tmp.h_proto = htons(skb->len - hdrlen); } pskb_pull(skb, hdrlen); if (!ehdr) ehdr = skb_push(skb, sizeof(struct ethhdr)); memcpy(ehdr, &tmp, sizeof(tmp)); return 0; } EXPORT_SYMBOL(ieee80211_data_to_8023_exthdr); static void __frame_add_frag(struct sk_buff *skb, struct page *page, void *ptr, int len, int size) { struct skb_shared_info *sh = skb_shinfo(skb); int page_offset; get_page(page); page_offset = ptr - page_address(page); skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size); } static void __ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame, int offset, int len) { struct skb_shared_info *sh = skb_shinfo(skb); const skb_frag_t *frag = &sh->frags[0]; struct page *frag_page; void *frag_ptr; int frag_len, frag_size; int head_size = skb->len - skb->data_len; int cur_len; frag_page = virt_to_head_page(skb->head); frag_ptr = skb->data; frag_size = head_size; while (offset >= frag_size) { offset -= frag_size; frag_page = skb_frag_page(frag); frag_ptr = skb_frag_address(frag); frag_size = skb_frag_size(frag); frag++; } frag_ptr += offset; frag_len = frag_size - offset; cur_len = min(len, frag_len); __frame_add_frag(frame, frag_page, frag_ptr, cur_len, frag_size); len -= cur_len; while (len > 0) { frag_len = skb_frag_size(frag); cur_len = min(len, frag_len); __frame_add_frag(frame, skb_frag_page(frag), skb_frag_address(frag), cur_len, frag_len); len -= cur_len; frag++; } } static struct sk_buff * __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, int offset, int len, bool reuse_frag, int min_len) { struct sk_buff *frame; int cur_len = len; if (skb->len - offset < len) return NULL; /* * When reusing framents, copy some data to the head to simplify * ethernet header handling and speed up protocol header processing * in the stack later. */ if (reuse_frag) cur_len = min_t(int, len, min_len); /* * Allocate and reserve two bytes more for payload * alignment since sizeof(struct ethhdr) is 14. */ frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + cur_len); if (!frame) return NULL; frame->priority = skb->priority; skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len); len -= cur_len; if (!len) return frame; offset += cur_len; __ieee80211_amsdu_copy_frag(skb, frame, offset, len); return frame; } static u16 ieee80211_amsdu_subframe_length(void *field, u8 mesh_flags, u8 hdr_type) { __le16 *field_le = field; __be16 *field_be = field; u16 len; if (hdr_type >= 2) len = le16_to_cpu(*field_le); else len = be16_to_cpu(*field_be); if (hdr_type) len += __ieee80211_get_mesh_hdrlen(mesh_flags); return len; } bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr) { int offset = 0, subframe_len, padding; for (offset = 0; offset < skb->len; offset += subframe_len + padding) { int remaining = skb->len - offset; struct { __be16 len; u8 mesh_flags; } hdr; u16 len; if (sizeof(hdr) > remaining) return false; if (skb_copy_bits(skb, offset + 2 * ETH_ALEN, &hdr, sizeof(hdr)) < 0) return false; len = ieee80211_amsdu_subframe_length(&hdr.len, hdr.mesh_flags, mesh_hdr); subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; if (subframe_len > remaining) return false; } return true; } EXPORT_SYMBOL(ieee80211_is_valid_amsdu); void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, const unsigned int extra_headroom, const u8 *check_da, const u8 *check_sa, u8 mesh_control) { unsigned int hlen = ALIGN(extra_headroom, 4); struct sk_buff *frame = NULL; int offset = 0; struct { struct ethhdr eth; uint8_t flags; } hdr; bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb); bool reuse_skb = false; bool last = false; int copy_len = sizeof(hdr.eth); if (iftype == NL80211_IFTYPE_MESH_POINT) copy_len = sizeof(hdr); while (!last) { int remaining = skb->len - offset; unsigned int subframe_len; int len, mesh_len = 0; u8 padding; if (copy_len > remaining) goto purge; skb_copy_bits(skb, offset, &hdr, copy_len); if (iftype == NL80211_IFTYPE_MESH_POINT) mesh_len = __ieee80211_get_mesh_hdrlen(hdr.flags); len = ieee80211_amsdu_subframe_length(&hdr.eth.h_proto, hdr.flags, mesh_control); subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; /* the last MSDU has no padding */ if (subframe_len > remaining) goto purge; /* mitigate A-MSDU aggregation injection attacks */ if (ether_addr_equal(hdr.eth.h_dest, rfc1042_header)) goto purge; offset += sizeof(struct ethhdr); last = remaining <= subframe_len + padding; /* FIXME: should we really accept multicast DA? */ if ((check_da && !is_multicast_ether_addr(hdr.eth.h_dest) && !ether_addr_equal(check_da, hdr.eth.h_dest)) || (check_sa && !ether_addr_equal(check_sa, hdr.eth.h_source))) { offset += len + padding; continue; } /* reuse skb for the last subframe */ if (!skb_is_nonlinear(skb) && !reuse_frag && last) { skb_pull(skb, offset); frame = skb; reuse_skb = true; } else { frame = __ieee80211_amsdu_copy(skb, hlen, offset, len, reuse_frag, 32 + mesh_len); if (!frame) goto purge; offset += len + padding; } skb_reset_network_header(frame); frame->dev = skb->dev; frame->priority = skb->priority; if (likely(iftype != NL80211_IFTYPE_MESH_POINT && ieee80211_get_8023_tunnel_proto(frame->data, &hdr.eth.h_proto))) skb_pull(frame, ETH_ALEN + 2); memcpy(skb_push(frame, sizeof(hdr.eth)), &hdr.eth, sizeof(hdr.eth)); __skb_queue_tail(list, frame); } if (!reuse_skb) dev_kfree_skb(skb); return; purge: __skb_queue_purge(list); dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_amsdu_to_8023s); /* Given a data frame determine the 802.1p/1d tag to use. */ unsigned int cfg80211_classify8021d(struct sk_buff *skb, struct cfg80211_qos_map *qos_map) { unsigned int dscp; unsigned char vlan_priority; unsigned int ret; /* skb->priority values from 256->263 are magic values to * directly indicate a specific 802.1d priority. This is used * to allow 802.1d priority to be passed directly in from VLAN * tags, etc. */ if (skb->priority >= 256 && skb->priority <= 263) { ret = skb->priority - 256; goto out; } if (skb_vlan_tag_present(skb)) { vlan_priority = (skb_vlan_tag_get(skb) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; if (vlan_priority > 0) { ret = vlan_priority; goto out; } } switch (skb->protocol) { case htons(ETH_P_IP): dscp = ipv4_get_dsfield(ip_hdr(skb)) & 0xfc; break; case htons(ETH_P_IPV6): dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & 0xfc; break; case htons(ETH_P_MPLS_UC): case htons(ETH_P_MPLS_MC): { struct mpls_label mpls_tmp, *mpls; mpls = skb_header_pointer(skb, sizeof(struct ethhdr), sizeof(*mpls), &mpls_tmp); if (!mpls) return 0; ret = (ntohl(mpls->entry) & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; goto out; } case htons(ETH_P_80221): /* 802.21 is always network control traffic */ return 7; default: return 0; } if (qos_map) { unsigned int i, tmp_dscp = dscp >> 2; for (i = 0; i < qos_map->num_des; i++) { if (tmp_dscp == qos_map->dscp_exception[i].dscp) { ret = qos_map->dscp_exception[i].up; goto out; } } for (i = 0; i < 8; i++) { if (tmp_dscp >= qos_map->up[i].low && tmp_dscp <= qos_map->up[i].high) { ret = i; goto out; } } } /* The default mapping as defined Section 2.3 in RFC8325: The three * Most Significant Bits (MSBs) of the DSCP are used as the * corresponding L2 markings. */ ret = dscp >> 5; /* Handle specific DSCP values for which the default mapping (as * described above) doesn't adhere to the intended usage of the DSCP * value. See section 4 in RFC8325. Specifically, for the following * Diffserv Service Classes no update is needed: * - Standard: DF * - Low Priority Data: CS1 * - Multimedia Streaming: AF31, AF32, AF33 * - Multimedia Conferencing: AF41, AF42, AF43 * - Network Control Traffic: CS7 * - Real-Time Interactive: CS4 */ switch (dscp >> 2) { case 10: case 12: case 14: /* High throughput data: AF11, AF12, AF13 */ ret = 0; break; case 16: /* Operations, Administration, and Maintenance and Provisioning: * CS2 */ ret = 0; break; case 18: case 20: case 22: /* Low latency data: AF21, AF22, AF23 */ ret = 3; break; case 24: /* Broadcasting video: CS3 */ ret = 4; break; case 40: /* Signaling: CS5 */ ret = 5; break; case 44: /* Voice Admit: VA */ ret = 6; break; case 46: /* Telephony traffic: EF */ ret = 6; break; case 48: /* Network Control Traffic: CS6 */ ret = 7; break; } out: return array_index_nospec(ret, IEEE80211_NUM_TIDS); } EXPORT_SYMBOL(cfg80211_classify8021d); const struct element *ieee80211_bss_get_elem(struct cfg80211_bss *bss, u8 id) { const struct cfg80211_bss_ies *ies; ies = rcu_dereference(bss->ies); if (!ies) return NULL; return cfg80211_find_elem(id, ies->data, ies->len); } EXPORT_SYMBOL(ieee80211_bss_get_elem); void cfg80211_upload_connect_keys(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct net_device *dev = wdev->netdev; int i; if (!wdev->connect_keys) return; for (i = 0; i < 4; i++) { if (!wdev->connect_keys->params[i].cipher) continue; if (rdev_add_key(rdev, dev, -1, i, false, NULL, &wdev->connect_keys->params[i])) { netdev_err(dev, "failed to set key %d\n", i); continue; } if (wdev->connect_keys->def == i && rdev_set_default_key(rdev, dev, -1, i, true, true)) { netdev_err(dev, "failed to set defkey %d\n", i); continue; } } kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; } void cfg80211_process_wdev_events(struct wireless_dev *wdev) { struct cfg80211_event *ev; unsigned long flags; spin_lock_irqsave(&wdev->event_lock, flags); while (!list_empty(&wdev->event_list)) { ev = list_first_entry(&wdev->event_list, struct cfg80211_event, list); list_del(&ev->list); spin_unlock_irqrestore(&wdev->event_lock, flags); switch (ev->type) { case EVENT_CONNECT_RESULT: __cfg80211_connect_result( wdev->netdev, &ev->cr, ev->cr.status == WLAN_STATUS_SUCCESS); break; case EVENT_ROAMED: __cfg80211_roamed(wdev, &ev->rm); break; case EVENT_DISCONNECTED: __cfg80211_disconnected(wdev->netdev, ev->dc.ie, ev->dc.ie_len, ev->dc.reason, !ev->dc.locally_generated); break; case EVENT_IBSS_JOINED: __cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid, ev->ij.channel); break; case EVENT_STOPPED: cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); break; case EVENT_PORT_AUTHORIZED: __cfg80211_port_authorized(wdev, ev->pa.peer_addr, ev->pa.td_bitmap, ev->pa.td_bitmap_len); break; } kfree(ev); spin_lock_irqsave(&wdev->event_lock, flags); } spin_unlock_irqrestore(&wdev->event_lock, flags); } void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; lockdep_assert_held(&rdev->wiphy.mtx); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_process_wdev_events(wdev); } int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, struct vif_params *params) { int err; enum nl80211_iftype otype = dev->ieee80211_ptr->iftype; lockdep_assert_held(&rdev->wiphy.mtx); /* don't support changing VLANs, you just re-create them */ if (otype == NL80211_IFTYPE_AP_VLAN) return -EOPNOTSUPP; /* cannot change into P2P device or NAN */ if (ntype == NL80211_IFTYPE_P2P_DEVICE || ntype == NL80211_IFTYPE_NAN) return -EOPNOTSUPP; if (!rdev->ops->change_virtual_intf || !(rdev->wiphy.interface_modes & (1 << ntype))) return -EOPNOTSUPP; if (ntype != otype) { /* if it's part of a bridge, reject changing type to station/ibss */ if (netif_is_bridge_port(dev) && (ntype == NL80211_IFTYPE_ADHOC || ntype == NL80211_IFTYPE_STATION || ntype == NL80211_IFTYPE_P2P_CLIENT)) return -EBUSY; dev->ieee80211_ptr->use_4addr = false; rdev_set_qos_map(rdev, dev, NULL); switch (otype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, dev, -1, true); break; case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, dev, false); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, true); break; case NL80211_IFTYPE_MESH_POINT: /* mesh should be handled? */ break; case NL80211_IFTYPE_OCB: cfg80211_leave_ocb(rdev, dev); break; default: break; } cfg80211_process_rdev_events(rdev); cfg80211_mlme_purge_registrations(dev->ieee80211_ptr); memset(&dev->ieee80211_ptr->u, 0, sizeof(dev->ieee80211_ptr->u)); memset(&dev->ieee80211_ptr->links, 0, sizeof(dev->ieee80211_ptr->links)); } err = rdev_change_virtual_intf(rdev, dev, ntype, params); WARN_ON(!err && dev->ieee80211_ptr->iftype != ntype); if (!err && params && params->use_4addr != -1) dev->ieee80211_ptr->use_4addr = params->use_4addr; if (!err) { dev->priv_flags &= ~IFF_DONT_BRIDGE; switch (ntype) { case NL80211_IFTYPE_STATION: if (dev->ieee80211_ptr->use_4addr) break; fallthrough; case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_ADHOC: dev->priv_flags |= IFF_DONT_BRIDGE; break; case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MESH_POINT: /* bridging OK */ break; case NL80211_IFTYPE_MONITOR: /* monitor can't bridge anyway */ break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: /* not happening */ break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_NAN: WARN_ON(1); break; } } if (!err && ntype != otype && netif_running(dev)) { cfg80211_update_iface_num(rdev, ntype, 1); cfg80211_update_iface_num(rdev, otype, -1); } return err; } static u32 cfg80211_calculate_bitrate_ht(struct rate_info *rate) { int modulation, streams, bitrate; /* the formula below does only work for MCS values smaller than 32 */ if (WARN_ON_ONCE(rate->mcs >= 32)) return 0; modulation = rate->mcs & 7; streams = (rate->mcs >> 3) + 1; bitrate = (rate->bw == RATE_INFO_BW_40) ? 13500000 : 6500000; if (modulation < 4) bitrate *= (modulation + 1); else if (modulation == 4) bitrate *= (modulation + 2); else bitrate *= (modulation + 3); bitrate *= streams; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; } static u32 cfg80211_calculate_bitrate_dmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { /* control PHY */ [0] = 275, /* SC PHY */ [1] = 3850, [2] = 7700, [3] = 9625, [4] = 11550, [5] = 12512, /* 1251.25 mbps */ [6] = 15400, [7] = 19250, [8] = 23100, [9] = 25025, [10] = 30800, [11] = 38500, [12] = 46200, /* OFDM PHY */ [13] = 6930, [14] = 8662, /* 866.25 mbps */ [15] = 13860, [16] = 17325, [17] = 20790, [18] = 27720, [19] = 34650, [20] = 41580, [21] = 45045, [22] = 51975, [23] = 62370, [24] = 67568, /* 6756.75 mbps */ /* LP-SC PHY */ [25] = 6260, [26] = 8340, [27] = 11120, [28] = 12510, [29] = 16680, [30] = 22240, [31] = 25030, }; if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate))) return 0; return __mcs2bitrate[rate->mcs]; } static u32 cfg80211_calculate_bitrate_extended_sc_dmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { [6 - 6] = 26950, /* MCS 9.1 : 2695.0 mbps */ [7 - 6] = 50050, /* MCS 12.1 */ [8 - 6] = 53900, [9 - 6] = 57750, [10 - 6] = 63900, [11 - 6] = 75075, [12 - 6] = 80850, }; /* Extended SC MCS not defined for base MCS below 6 or above 12 */ if (WARN_ON_ONCE(rate->mcs < 6 || rate->mcs > 12)) return 0; return __mcs2bitrate[rate->mcs - 6]; } static u32 cfg80211_calculate_bitrate_edmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { /* control PHY */ [0] = 275, /* SC PHY */ [1] = 3850, [2] = 7700, [3] = 9625, [4] = 11550, [5] = 12512, /* 1251.25 mbps */ [6] = 13475, [7] = 15400, [8] = 19250, [9] = 23100, [10] = 25025, [11] = 26950, [12] = 30800, [13] = 38500, [14] = 46200, [15] = 50050, [16] = 53900, [17] = 57750, [18] = 69300, [19] = 75075, [20] = 80850, }; if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate))) return 0; return __mcs2bitrate[rate->mcs] * rate->n_bonded_ch; } static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) { static const u32 base[4][12] = { { 6500000, 13000000, 19500000, 26000000, 39000000, 52000000, 58500000, 65000000, 78000000, /* not in the spec, but some devices use this: */ 86700000, 97500000, 108300000, }, { 13500000, 27000000, 40500000, 54000000, 81000000, 108000000, 121500000, 135000000, 162000000, 180000000, 202500000, 225000000, }, { 29300000, 58500000, 87800000, 117000000, 175500000, 234000000, 263300000, 292500000, 351000000, 390000000, 438800000, 487500000, }, { 58500000, 117000000, 175500000, 234000000, 351000000, 468000000, 526500000, 585000000, 702000000, 780000000, 877500000, 975000000, }, }; u32 bitrate; int idx; if (rate->mcs > 11) goto warn; switch (rate->bw) { case RATE_INFO_BW_160: idx = 3; break; case RATE_INFO_BW_80: idx = 2; break; case RATE_INFO_BW_40: idx = 1; break; case RATE_INFO_BW_5: case RATE_INFO_BW_10: default: goto warn; case RATE_INFO_BW_20: idx = 0; } bitrate = base[idx][rate->mcs]; bitrate *= rate->nss; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; warn: WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n", rate->bw, rate->mcs, rate->nss); return 0; } static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) { #define SCALE 6144 u32 mcs_divisors[14] = { 102399, /* 16.666666... */ 51201, /* 8.333333... */ 34134, /* 5.555555... */ 25599, /* 4.166666... */ 17067, /* 2.777777... */ 12801, /* 2.083333... */ 11377, /* 1.851725... */ 10239, /* 1.666666... */ 8532, /* 1.388888... */ 7680, /* 1.250000... */ 6828, /* 1.111111... */ 6144, /* 1.000000... */ 5690, /* 0.926106... */ 5120, /* 0.833333... */ }; u32 rates_160M[3] = { 960777777, 907400000, 816666666 }; u32 rates_969[3] = { 480388888, 453700000, 408333333 }; u32 rates_484[3] = { 229411111, 216666666, 195000000 }; u32 rates_242[3] = { 114711111, 108333333, 97500000 }; u32 rates_106[3] = { 40000000, 37777777, 34000000 }; u32 rates_52[3] = { 18820000, 17777777, 16000000 }; u32 rates_26[3] = { 9411111, 8888888, 8000000 }; u64 tmp; u32 result; if (WARN_ON_ONCE(rate->mcs > 13)) return 0; if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2)) return 0; if (WARN_ON_ONCE(rate->he_ru_alloc > NL80211_RATE_INFO_HE_RU_ALLOC_2x996)) return 0; if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) return 0; if (rate->bw == RATE_INFO_BW_160) result = rates_160M[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_80 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996)) result = rates_969[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_40 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484)) result = rates_484[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_20 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242)) result = rates_242[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106) result = rates_106[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52) result = rates_52[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26) result = rates_26[rate->he_gi]; else { WARN(1, "invalid HE MCS: bw:%d, ru:%d\n", rate->bw, rate->he_ru_alloc); return 0; } /* now scale to the appropriate MCS */ tmp = result; tmp *= SCALE; do_div(tmp, mcs_divisors[rate->mcs]); result = tmp; /* and take NSS, DCM into account */ result = (result * rate->nss) / 8; if (rate->he_dcm) result /= 2; return result / 10000; } static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) { #define SCALE 6144 static const u32 mcs_divisors[16] = { 102399, /* 16.666666... */ 51201, /* 8.333333... */ 34134, /* 5.555555... */ 25599, /* 4.166666... */ 17067, /* 2.777777... */ 12801, /* 2.083333... */ 11377, /* 1.851725... */ 10239, /* 1.666666... */ 8532, /* 1.388888... */ 7680, /* 1.250000... */ 6828, /* 1.111111... */ 6144, /* 1.000000... */ 5690, /* 0.926106... */ 5120, /* 0.833333... */ 409600, /* 66.666666... */ 204800, /* 33.333333... */ }; static const u32 rates_996[3] = { 480388888, 453700000, 408333333 }; static const u32 rates_484[3] = { 229411111, 216666666, 195000000 }; static const u32 rates_242[3] = { 114711111, 108333333, 97500000 }; static const u32 rates_106[3] = { 40000000, 37777777, 34000000 }; static const u32 rates_52[3] = { 18820000, 17777777, 16000000 }; static const u32 rates_26[3] = { 9411111, 8888888, 8000000 }; u64 tmp; u32 result; if (WARN_ON_ONCE(rate->mcs > 15)) return 0; if (WARN_ON_ONCE(rate->eht_gi > NL80211_RATE_INFO_EHT_GI_3_2)) return 0; if (WARN_ON_ONCE(rate->eht_ru_alloc > NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) return 0; if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) return 0; /* Bandwidth checks for MCS 14 */ if (rate->mcs == 14) { if ((rate->bw != RATE_INFO_BW_EHT_RU && rate->bw != RATE_INFO_BW_80 && rate->bw != RATE_INFO_BW_160 && rate->bw != RATE_INFO_BW_320) || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_996 && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_2x996 && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) { WARN(1, "invalid EHT BW for MCS 14: bw:%d, ru:%d\n", rate->bw, rate->eht_ru_alloc); return 0; } } if (rate->bw == RATE_INFO_BW_320 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) result = 4 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484) result = 3 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996) result = 3 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484) result = 2 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_160 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996)) result = 2 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242) result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi] + rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484) result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_80 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996)) result = rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484P242) result = rates_484[rate->eht_gi] + rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_40 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484)) result = rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_20 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_242)) result = rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106P26) result = rates_106[rate->eht_gi] + rates_26[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106) result = rates_106[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52P26) result = rates_52[rate->eht_gi] + rates_26[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52) result = rates_52[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26) result = rates_26[rate->eht_gi]; else { WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n", rate->bw, rate->eht_ru_alloc); return 0; } /* now scale to the appropriate MCS */ tmp = result; tmp *= SCALE; do_div(tmp, mcs_divisors[rate->mcs]); /* and take NSS */ tmp *= rate->nss; do_div(tmp, 8); result = tmp; return result / 10000; } static u32 cfg80211_calculate_bitrate_s1g(struct rate_info *rate) { /* For 1, 2, 4, 8 and 16 MHz channels */ static const u32 base[5][11] = { { 300000, 600000, 900000, 1200000, 1800000, 2400000, 2700000, 3000000, 3600000, 4000000, /* MCS 10 supported in 1 MHz only */ 150000, }, { 650000, 1300000, 1950000, 2600000, 3900000, 5200000, 5850000, 6500000, 7800000, /* MCS 9 not valid */ }, { 1350000, 2700000, 4050000, 5400000, 8100000, 10800000, 12150000, 13500000, 16200000, 18000000, }, { 2925000, 5850000, 8775000, 11700000, 17550000, 23400000, 26325000, 29250000, 35100000, 39000000, }, { 8580000, 11700000, 17550000, 23400000, 35100000, 46800000, 52650000, 58500000, 70200000, 78000000, }, }; u32 bitrate; /* default is 1 MHz index */ int idx = 0; if (rate->mcs >= 11) goto warn; switch (rate->bw) { case RATE_INFO_BW_16: idx = 4; break; case RATE_INFO_BW_8: idx = 3; break; case RATE_INFO_BW_4: idx = 2; break; case RATE_INFO_BW_2: idx = 1; break; case RATE_INFO_BW_1: idx = 0; break; case RATE_INFO_BW_5: case RATE_INFO_BW_10: case RATE_INFO_BW_20: case RATE_INFO_BW_40: case RATE_INFO_BW_80: case RATE_INFO_BW_160: default: goto warn; } bitrate = base[idx][rate->mcs]; bitrate *= rate->nss; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; warn: WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n", rate->bw, rate->mcs, rate->nss); return 0; } u32 cfg80211_calculate_bitrate(struct rate_info *rate) { if (rate->flags & RATE_INFO_FLAGS_MCS) return cfg80211_calculate_bitrate_ht(rate); if (rate->flags & RATE_INFO_FLAGS_DMG) return cfg80211_calculate_bitrate_dmg(rate); if (rate->flags & RATE_INFO_FLAGS_EXTENDED_SC_DMG) return cfg80211_calculate_bitrate_extended_sc_dmg(rate); if (rate->flags & RATE_INFO_FLAGS_EDMG) return cfg80211_calculate_bitrate_edmg(rate); if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) return cfg80211_calculate_bitrate_vht(rate); if (rate->flags & RATE_INFO_FLAGS_HE_MCS) return cfg80211_calculate_bitrate_he(rate); if (rate->flags & RATE_INFO_FLAGS_EHT_MCS) return cfg80211_calculate_bitrate_eht(rate); if (rate->flags & RATE_INFO_FLAGS_S1G_MCS) return cfg80211_calculate_bitrate_s1g(rate); return rate->legacy; } EXPORT_SYMBOL(cfg80211_calculate_bitrate); int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, enum ieee80211_p2p_attr_id attr, u8 *buf, unsigned int bufsize) { u8 *out = buf; u16 attr_remaining = 0; bool desired_attr = false; u16 desired_len = 0; while (len > 0) { unsigned int iedatalen; unsigned int copy; const u8 *iedata; if (len < 2) return -EILSEQ; iedatalen = ies[1]; if (iedatalen + 2 > len) return -EILSEQ; if (ies[0] != WLAN_EID_VENDOR_SPECIFIC) goto cont; if (iedatalen < 4) goto cont; iedata = ies + 2; /* check WFA OUI, P2P subtype */ if (iedata[0] != 0x50 || iedata[1] != 0x6f || iedata[2] != 0x9a || iedata[3] != 0x09) goto cont; iedatalen -= 4; iedata += 4; /* check attribute continuation into this IE */ copy = min_t(unsigned int, attr_remaining, iedatalen); if (copy && desired_attr) { desired_len += copy; if (out) { memcpy(out, iedata, min(bufsize, copy)); out += min(bufsize, copy); bufsize -= min(bufsize, copy); } if (copy == attr_remaining) return desired_len; } attr_remaining -= copy; if (attr_remaining) goto cont; iedatalen -= copy; iedata += copy; while (iedatalen > 0) { u16 attr_len; /* P2P attribute ID & size must fit */ if (iedatalen < 3) return -EILSEQ; desired_attr = iedata[0] == attr; attr_len = get_unaligned_le16(iedata + 1); iedatalen -= 3; iedata += 3; copy = min_t(unsigned int, attr_len, iedatalen); if (desired_attr) { desired_len += copy; if (out) { memcpy(out, iedata, min(bufsize, copy)); out += min(bufsize, copy); bufsize -= min(bufsize, copy); } if (copy == attr_len) return desired_len; } iedata += copy; iedatalen -= copy; attr_remaining = attr_len - copy; } cont: len -= ies[1] + 2; ies += ies[1] + 2; } if (attr_remaining && desired_attr) return -EILSEQ; return -ENOENT; } EXPORT_SYMBOL(cfg80211_get_p2p_attr); static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id, bool id_ext) { int i; /* Make sure array values are legal */ if (WARN_ON(ids[n_ids - 1] == WLAN_EID_EXTENSION)) return false; i = 0; while (i < n_ids) { if (ids[i] == WLAN_EID_EXTENSION) { if (id_ext && (ids[i + 1] == id)) return true; i += 2; continue; } if (ids[i] == id && !id_ext) return true; i++; } return false; } static size_t skip_ie(const u8 *ies, size_t ielen, size_t pos) { /* we assume a validly formed IEs buffer */ u8 len = ies[pos + 1]; pos += 2 + len; /* the IE itself must have 255 bytes for fragments to follow */ if (len < 255) return pos; while (pos < ielen && ies[pos] == WLAN_EID_FRAGMENT) { len = ies[pos + 1]; pos += 2 + len; } return pos; } size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, const u8 *ids, int n_ids, const u8 *after_ric, int n_after_ric, size_t offset) { size_t pos = offset; while (pos < ielen) { u8 ext = 0; if (ies[pos] == WLAN_EID_EXTENSION) ext = 2; if ((pos + ext) >= ielen) break; if (!ieee80211_id_in_list(ids, n_ids, ies[pos + ext], ies[pos] == WLAN_EID_EXTENSION)) break; if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) { pos = skip_ie(ies, ielen, pos); while (pos < ielen) { if (ies[pos] == WLAN_EID_EXTENSION) ext = 2; else ext = 0; if ((pos + ext) >= ielen) break; if (!ieee80211_id_in_list(after_ric, n_after_ric, ies[pos + ext], ext == 2)) pos = skip_ie(ies, ielen, pos); else break; } } else { pos = skip_ie(ies, ielen, pos); } } return pos; } EXPORT_SYMBOL(ieee80211_ie_split_ric); void ieee80211_fragment_element(struct sk_buff *skb, u8 *len_pos, u8 frag_id) { unsigned int elem_len; if (!len_pos) return; elem_len = skb->data + skb->len - len_pos - 1; while (elem_len > 255) { /* this one is 255 */ *len_pos = 255; /* remaining data gets smaller */ elem_len -= 255; /* make space for the fragment ID/len in SKB */ skb_put(skb, 2); /* shift back the remaining data to place fragment ID/len */ memmove(len_pos + 255 + 3, len_pos + 255 + 1, elem_len); /* place the fragment ID */ len_pos += 255 + 1; *len_pos = frag_id; /* and point to fragment length to update later */ len_pos++; } *len_pos = elem_len; } EXPORT_SYMBOL(ieee80211_fragment_element); bool ieee80211_operating_class_to_band(u8 operating_class, enum nl80211_band *band) { switch (operating_class) { case 112: case 115 ... 127: case 128 ... 130: *band = NL80211_BAND_5GHZ; return true; case 131 ... 135: case 137: *band = NL80211_BAND_6GHZ; return true; case 81: case 82: case 83: case 84: *band = NL80211_BAND_2GHZ; return true; case 180: *band = NL80211_BAND_60GHZ; return true; } return false; } EXPORT_SYMBOL(ieee80211_operating_class_to_band); bool ieee80211_operating_class_to_chandef(u8 operating_class, struct ieee80211_channel *chan, struct cfg80211_chan_def *chandef) { u32 control_freq, offset = 0; enum nl80211_band band; if (!ieee80211_operating_class_to_band(operating_class, &band) || !chan || band != chan->band) return false; control_freq = chan->center_freq; chandef->chan = chan; if (control_freq >= 5955) offset = control_freq - 5955; else if (control_freq >= 5745) offset = control_freq - 5745; else if (control_freq >= 5180) offset = control_freq - 5180; offset /= 20; switch (operating_class) { case 81: /* 2 GHz band; 20 MHz; channels 1..13 */ case 82: /* 2 GHz band; 20 MHz; channel 14 */ case 115: /* 5 GHz band; 20 MHz; channels 36,40,44,48 */ case 118: /* 5 GHz band; 20 MHz; channels 52,56,60,64 */ case 121: /* 5 GHz band; 20 MHz; channels 100..144 */ case 124: /* 5 GHz band; 20 MHz; channels 149,153,157,161 */ case 125: /* 5 GHz band; 20 MHz; channels 149..177 */ case 131: /* 6 GHz band; 20 MHz; channels 1..233*/ case 136: /* 6 GHz band; 20 MHz; channel 2 */ chandef->center_freq1 = control_freq; chandef->width = NL80211_CHAN_WIDTH_20; return true; case 83: /* 2 GHz band; 40 MHz; channels 1..9 */ case 116: /* 5 GHz band; 40 MHz; channels 36,44 */ case 119: /* 5 GHz band; 40 MHz; channels 52,60 */ case 122: /* 5 GHz band; 40 MHz; channels 100,108,116,124,132,140 */ case 126: /* 5 GHz band; 40 MHz; channels 149,157,165,173 */ chandef->center_freq1 = control_freq + 10; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 84: /* 2 GHz band; 40 MHz; channels 5..13 */ case 117: /* 5 GHz band; 40 MHz; channels 40,48 */ case 120: /* 5 GHz band; 40 MHz; channels 56,64 */ case 123: /* 5 GHz band; 40 MHz; channels 104,112,120,128,136,144 */ case 127: /* 5 GHz band; 40 MHz; channels 153,161,169,177 */ chandef->center_freq1 = control_freq - 10; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 132: /* 6 GHz band; 40 MHz; channels 1,5,..,229*/ chandef->center_freq1 = control_freq + 10 - (offset & 1) * 20; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 128: /* 5 GHz band; 80 MHz; channels 36..64,100..144,149..177 */ case 133: /* 6 GHz band; 80 MHz; channels 1,5,..,229 */ chandef->center_freq1 = control_freq + 30 - (offset & 3) * 20; chandef->width = NL80211_CHAN_WIDTH_80; return true; case 129: /* 5 GHz band; 160 MHz; channels 36..64,100..144,149..177 */ case 134: /* 6 GHz band; 160 MHz; channels 1,5,..,229 */ chandef->center_freq1 = control_freq + 70 - (offset & 7) * 20; chandef->width = NL80211_CHAN_WIDTH_160; return true; case 130: /* 5 GHz band; 80+80 MHz; channels 36..64,100..144,149..177 */ case 135: /* 6 GHz band; 80+80 MHz; channels 1,5,..,229 */ /* The center_freq2 of 80+80 MHz is unknown */ case 137: /* 6 GHz band; 320 MHz; channels 1,5,..,229 */ /* 320-1 or 320-2 channelization is unknown */ default: return false; } } EXPORT_SYMBOL(ieee80211_operating_class_to_chandef); bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, u8 *op_class) { u8 vht_opclass; u32 freq = chandef->center_freq1; if (freq >= 2412 && freq <= 2472) { if (chandef->width > NL80211_CHAN_WIDTH_40) return false; /* 2.407 GHz, channels 1..13 */ if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 83; /* HT40+ */ else *op_class = 84; /* HT40- */ } else { *op_class = 81; } return true; } if (freq == 2484) { /* channel 14 is only for IEEE 802.11b */ if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT) return false; *op_class = 82; /* channel 14 */ return true; } switch (chandef->width) { case NL80211_CHAN_WIDTH_80: vht_opclass = 128; break; case NL80211_CHAN_WIDTH_160: vht_opclass = 129; break; case NL80211_CHAN_WIDTH_80P80: vht_opclass = 130; break; case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_5: return false; /* unsupported for now */ default: vht_opclass = 0; break; } /* 5 GHz, channels 36..48 */ if (freq >= 5180 && freq <= 5240) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 116; else *op_class = 117; } else { *op_class = 115; } return true; } /* 5 GHz, channels 52..64 */ if (freq >= 5260 && freq <= 5320) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 119; else *op_class = 120; } else { *op_class = 118; } return true; } /* 5 GHz, channels 100..144 */ if (freq >= 5500 && freq <= 5720) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 122; else *op_class = 123; } else { *op_class = 121; } return true; } /* 5 GHz, channels 149..169 */ if (freq >= 5745 && freq <= 5845) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 126; else *op_class = 127; } else if (freq <= 5805) { *op_class = 124; } else { *op_class = 125; } return true; } /* 56.16 GHz, channel 1..4 */ if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 6) { if (chandef->width >= NL80211_CHAN_WIDTH_40) return false; *op_class = 180; return true; } /* not supported yet */ return false; } EXPORT_SYMBOL(ieee80211_chandef_to_operating_class); static int cfg80211_wdev_bi(struct wireless_dev *wdev) { switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: WARN_ON(wdev->valid_links); return wdev->links[0].ap.beacon_interval; case NL80211_IFTYPE_MESH_POINT: return wdev->u.mesh.beacon_interval; case NL80211_IFTYPE_ADHOC: return wdev->u.ibss.beacon_interval; default: break; } return 0; } static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int, u32 *beacon_int_gcd, bool *beacon_int_different) { struct wireless_dev *wdev; *beacon_int_gcd = 0; *beacon_int_different = false; list_for_each_entry(wdev, &wiphy->wdev_list, list) { int wdev_bi; /* this feature isn't supported with MLO */ if (wdev->valid_links) continue; wdev_bi = cfg80211_wdev_bi(wdev); if (!wdev_bi) continue; if (!*beacon_int_gcd) { *beacon_int_gcd = wdev_bi; continue; } if (wdev_bi == *beacon_int_gcd) continue; *beacon_int_different = true; *beacon_int_gcd = gcd(*beacon_int_gcd, wdev_bi); } if (new_beacon_int && *beacon_int_gcd != new_beacon_int) { if (*beacon_int_gcd) *beacon_int_different = true; *beacon_int_gcd = gcd(*beacon_int_gcd, new_beacon_int); } } int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, u32 beacon_int) { /* * This is just a basic pre-condition check; if interface combinations * are possible the driver must already be checking those with a call * to cfg80211_check_combinations(), in which case we'll validate more * through the cfg80211_calculate_bi_data() call and code in * cfg80211_iter_combinations(). */ if (beacon_int < 10 || beacon_int > 10000) return -EINVAL; return 0; } int cfg80211_iter_combinations(struct wiphy *wiphy, struct iface_combination_params *params, void (*iter)(const struct ieee80211_iface_combination *c, void *data), void *data) { const struct ieee80211_regdomain *regdom; enum nl80211_dfs_regions region = 0; int i, j, iftype; int num_interfaces = 0; u32 used_iftypes = 0; u32 beacon_int_gcd; bool beacon_int_different; /* * This is a bit strange, since the iteration used to rely only on * the data given by the driver, but here it now relies on context, * in form of the currently operating interfaces. * This is OK for all current users, and saves us from having to * push the GCD calculations into all the drivers. * In the future, this should probably rely more on data that's in * cfg80211 already - the only thing not would appear to be any new * interfaces (while being brought up) and channel/radar data. */ cfg80211_calculate_bi_data(wiphy, params->new_beacon_int, &beacon_int_gcd, &beacon_int_different); if (params->radar_detect) { rcu_read_lock(); regdom = rcu_dereference(cfg80211_regdomain); if (regdom) region = regdom->dfs_region; rcu_read_unlock(); } for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { num_interfaces += params->iftype_num[iftype]; if (params->iftype_num[iftype] > 0 && !cfg80211_iftype_allowed(wiphy, iftype, 0, 1)) used_iftypes |= BIT(iftype); } for (i = 0; i < wiphy->n_iface_combinations; i++) { const struct ieee80211_iface_combination *c; struct ieee80211_iface_limit *limits; u32 all_iftypes = 0; c = &wiphy->iface_combinations[i]; if (num_interfaces > c->max_interfaces) continue; if (params->num_different_channels > c->num_different_channels) continue; limits = kmemdup(c->limits, sizeof(limits[0]) * c->n_limits, GFP_KERNEL); if (!limits) return -ENOMEM; for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { if (cfg80211_iftype_allowed(wiphy, iftype, 0, 1)) continue; for (j = 0; j < c->n_limits; j++) { all_iftypes |= limits[j].types; if (!(limits[j].types & BIT(iftype))) continue; if (limits[j].max < params->iftype_num[iftype]) goto cont; limits[j].max -= params->iftype_num[iftype]; } } if (params->radar_detect != (c->radar_detect_widths & params->radar_detect)) goto cont; if (params->radar_detect && c->radar_detect_regions && !(c->radar_detect_regions & BIT(region))) goto cont; /* Finally check that all iftypes that we're currently * using are actually part of this combination. If they * aren't then we can't use this combination and have * to continue to the next. */ if ((all_iftypes & used_iftypes) != used_iftypes) goto cont; if (beacon_int_gcd) { if (c->beacon_int_min_gcd && beacon_int_gcd < c->beacon_int_min_gcd) goto cont; if (!c->beacon_int_min_gcd && beacon_int_different) goto cont; } /* This combination covered all interface types and * supported the requested numbers, so we're good. */ (*iter)(c, data); cont: kfree(limits); } return 0; } EXPORT_SYMBOL(cfg80211_iter_combinations); static void cfg80211_iter_sum_ifcombs(const struct ieee80211_iface_combination *c, void *data) { int *num = data; (*num)++; } int cfg80211_check_combinations(struct wiphy *wiphy, struct iface_combination_params *params) { int err, num = 0; err = cfg80211_iter_combinations(wiphy, params, cfg80211_iter_sum_ifcombs, &num); if (err) return err; if (num == 0) return -EBUSY; return 0; } EXPORT_SYMBOL(cfg80211_check_combinations); int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, const u8 *rates, unsigned int n_rates, u32 *mask) { int i, j; if (!sband) return -EINVAL; if (n_rates == 0 || n_rates > NL80211_MAX_SUPP_RATES) return -EINVAL; *mask = 0; for (i = 0; i < n_rates; i++) { int rate = (rates[i] & 0x7f) * 5; bool found = false; for (j = 0; j < sband->n_bitrates; j++) { if (sband->bitrates[j].bitrate == rate) { found = true; *mask |= BIT(j); break; } } if (!found) return -EINVAL; } /* * mask must have at least one bit set here since we * didn't accept a 0-length rates array nor allowed * entries in the array that didn't exist */ return 0; } unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy) { enum nl80211_band band; unsigned int n_channels = 0; for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) n_channels += wiphy->bands[band]->n_channels; return n_channels; } EXPORT_SYMBOL(ieee80211_get_num_supported_channels); int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, struct station_info *sinfo) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; wdev = dev->ieee80211_ptr; if (!wdev) return -EOPNOTSUPP; rdev = wiphy_to_rdev(wdev->wiphy); if (!rdev->ops->get_station) return -EOPNOTSUPP; memset(sinfo, 0, sizeof(*sinfo)); return rdev_get_station(rdev, dev, mac_addr, sinfo); } EXPORT_SYMBOL(cfg80211_get_station); void cfg80211_free_nan_func(struct cfg80211_nan_func *f) { int i; if (!f) return; kfree(f->serv_spec_info); kfree(f->srf_bf); kfree(f->srf_macs); for (i = 0; i < f->num_rx_filters; i++) kfree(f->rx_filters[i].filter); for (i = 0; i < f->num_tx_filters; i++) kfree(f->tx_filters[i].filter); kfree(f->rx_filters); kfree(f->tx_filters); kfree(f); } EXPORT_SYMBOL(cfg80211_free_nan_func); bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, u32 center_freq_khz, u32 bw_khz) { u32 start_freq_khz, end_freq_khz; start_freq_khz = center_freq_khz - (bw_khz / 2); end_freq_khz = center_freq_khz + (bw_khz / 2); if (start_freq_khz >= freq_range->start_freq_khz && end_freq_khz <= freq_range->end_freq_khz) return true; return false; } int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp) { sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, sizeof(*(sinfo->pertid)), gfp); if (!sinfo->pertid) return -ENOMEM; return 0; } EXPORT_SYMBOL(cfg80211_sinfo_alloc_tid_stats); /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ /* Ethernet-II snap header (RFC1042 for most EtherTypes) */ const unsigned char rfc1042_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 }; EXPORT_SYMBOL(rfc1042_header); /* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */ const unsigned char bridge_tunnel_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; EXPORT_SYMBOL(bridge_tunnel_header); /* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */ struct iapp_layer2_update { u8 da[ETH_ALEN]; /* broadcast */ u8 sa[ETH_ALEN]; /* STA addr */ __be16 len; /* 6 */ u8 dsap; /* 0 */ u8 ssap; /* 0 */ u8 control; u8 xid_info[3]; } __packed; void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr) { struct iapp_layer2_update *msg; struct sk_buff *skb; /* Send Level 2 Update Frame to update forwarding tables in layer 2 * bridge devices */ skb = dev_alloc_skb(sizeof(*msg)); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID) * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */ eth_broadcast_addr(msg->da); ether_addr_copy(msg->sa, addr); msg->len = htons(6); msg->dsap = 0; msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */ msg->control = 0xaf; /* XID response lsb.1111F101. * F=0 (no poll command; unsolicited frame) */ msg->xid_info[0] = 0x81; /* XID format identifier */ msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */ msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */ skb->dev = dev; skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); netif_rx(skb); } EXPORT_SYMBOL(cfg80211_send_layer2_update); int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, int mcs, bool ext_nss_bw_capable, unsigned int max_vht_nss) { u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map); int ext_nss_bw; int supp_width; int i, mcs_encoding; if (map == 0xffff) return 0; if (WARN_ON(mcs > 9 || max_vht_nss > 8)) return 0; if (mcs <= 7) mcs_encoding = 0; else if (mcs == 8) mcs_encoding = 1; else mcs_encoding = 2; if (!max_vht_nss) { /* find max_vht_nss for the given MCS */ for (i = 7; i >= 0; i--) { int supp = (map >> (2 * i)) & 3; if (supp == 3) continue; if (supp >= mcs_encoding) { max_vht_nss = i + 1; break; } } } if (!(cap->supp_mcs.tx_mcs_map & cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE))) return max_vht_nss; ext_nss_bw = le32_get_bits(cap->vht_cap_info, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); supp_width = le32_get_bits(cap->vht_cap_info, IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); /* if not capable, treat ext_nss_bw as 0 */ if (!ext_nss_bw_capable) ext_nss_bw = 0; /* This is invalid */ if (supp_width == 3) return 0; /* This is an invalid combination so pretend nothing is supported */ if (supp_width == 2 && (ext_nss_bw == 1 || ext_nss_bw == 2)) return 0; /* * Cover all the special cases according to IEEE 802.11-2016 * Table 9-250. All other cases are either factor of 1 or not * valid/supported. */ switch (bw) { case IEEE80211_VHT_CHANWIDTH_USE_HT: case IEEE80211_VHT_CHANWIDTH_80MHZ: if ((supp_width == 1 || supp_width == 2) && ext_nss_bw == 3) return 2 * max_vht_nss; break; case IEEE80211_VHT_CHANWIDTH_160MHZ: if (supp_width == 0 && (ext_nss_bw == 1 || ext_nss_bw == 2)) return max_vht_nss / 2; if (supp_width == 0 && ext_nss_bw == 3) return (3 * max_vht_nss) / 4; if (supp_width == 1 && ext_nss_bw == 3) return 2 * max_vht_nss; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: if (supp_width == 0 && ext_nss_bw == 1) return 0; /* not possible */ if (supp_width == 0 && ext_nss_bw == 2) return max_vht_nss / 2; if (supp_width == 0 && ext_nss_bw == 3) return (3 * max_vht_nss) / 4; if (supp_width == 1 && ext_nss_bw == 0) return 0; /* not possible */ if (supp_width == 1 && ext_nss_bw == 1) return max_vht_nss / 2; if (supp_width == 1 && ext_nss_bw == 2) return (3 * max_vht_nss) / 4; break; } /* not covered or invalid combination received */ return max_vht_nss; } EXPORT_SYMBOL(ieee80211_get_vht_max_nss); bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, bool is_4addr, u8 check_swif) { bool is_vlan = iftype == NL80211_IFTYPE_AP_VLAN; switch (check_swif) { case 0: if (is_vlan && is_4addr) return wiphy->flags & WIPHY_FLAG_4ADDR_AP; return wiphy->interface_modes & BIT(iftype); case 1: if (!(wiphy->software_iftypes & BIT(iftype)) && is_vlan) return wiphy->flags & WIPHY_FLAG_4ADDR_AP; return wiphy->software_iftypes & BIT(iftype); default: break; } return false; } EXPORT_SYMBOL(cfg80211_iftype_allowed); void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); lockdep_assert_wiphy(wdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, wdev->netdev, link_id, true); break; default: /* per-link not relevant */ break; } wdev->valid_links &= ~BIT(link_id); rdev_del_intf_link(rdev, wdev, link_id); eth_zero_addr(wdev->links[link_id].addr); } void cfg80211_remove_links(struct wireless_dev *wdev) { unsigned int link_id; /* * links are controlled by upper layers (userspace/cfg) * only for AP mode, so only remove them here for AP */ if (wdev->iftype != NL80211_IFTYPE_AP) return; if (wdev->valid_links) { for_each_valid_link(wdev, link_id) cfg80211_remove_link(wdev, link_id); } } int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { cfg80211_remove_links(wdev); return rdev_del_virtual_intf(rdev, wdev); } const struct wiphy_iftype_ext_capab * cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type) { int i; for (i = 0; i < wiphy->num_iftype_ext_capab; i++) { if (wiphy->iftype_ext_capab[i].iftype == type) return &wiphy->iftype_ext_capab[i]; } return NULL; } EXPORT_SYMBOL(cfg80211_get_iftype_ext_capa); |
3 3 2 3 3 3 3 3 3 2 2 3 3 1 3 2 3 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 | // SPDX-License-Identifier: GPL-2.0-or-later /* * DVB USB framework * * Copyright (C) 2004-6 Patrick Boettcher <patrick.boettcher@posteo.de> * Copyright (C) 2012 Antti Palosaari <crope@iki.fi> */ #include "dvb_usb_common.h" #include <media/media-device.h> static int dvb_usbv2_disable_rc_polling; module_param_named(disable_rc_polling, dvb_usbv2_disable_rc_polling, int, 0644); MODULE_PARM_DESC(disable_rc_polling, "disable remote control polling (default: 0)"); static int dvb_usb_force_pid_filter_usage; module_param_named(force_pid_filter_usage, dvb_usb_force_pid_filter_usage, int, 0444); MODULE_PARM_DESC(force_pid_filter_usage, "force all DVB USB devices to use a PID filter, if any (default: 0)"); static int dvb_usbv2_download_firmware(struct dvb_usb_device *d, const char *name) { int ret; const struct firmware *fw; dev_dbg(&d->udev->dev, "%s:\n", __func__); if (!d->props->download_firmware) { ret = -EINVAL; goto err; } ret = request_firmware(&fw, name, &d->udev->dev); if (ret < 0) { dev_err(&d->udev->dev, "%s: Did not find the firmware file '%s' (status %d). You can use <kernel_dir>/scripts/get_dvb_firmware to get the firmware\n", KBUILD_MODNAME, name, ret); goto err; } dev_info(&d->udev->dev, "%s: downloading firmware from file '%s'\n", KBUILD_MODNAME, name); ret = d->props->download_firmware(d, fw); release_firmware(fw); if (ret < 0) goto err; return ret; err: dev_dbg(&d->udev->dev, "%s: failed=%d\n", __func__, ret); return ret; } static int dvb_usbv2_i2c_init(struct dvb_usb_device *d) { int ret; dev_dbg(&d->udev->dev, "%s:\n", __func__); if (!d->props->i2c_algo) return 0; strscpy(d->i2c_adap.name, d->name, sizeof(d->i2c_adap.name)); d->i2c_adap.algo = d->props->i2c_algo; d->i2c_adap.dev.parent = &d->udev->dev; i2c_set_adapdata(&d->i2c_adap, d); ret = i2c_add_adapter(&d->i2c_adap); if (ret < 0) { d->i2c_adap.algo = NULL; goto err; } return 0; err: dev_dbg(&d->udev->dev, "%s: failed=%d\n", __func__, ret); return ret; } static int dvb_usbv2_i2c_exit(struct dvb_usb_device *d) { dev_dbg(&d->udev->dev, "%s:\n", __func__); if (d->i2c_adap.algo) i2c_del_adapter(&d->i2c_adap); return 0; } #if IS_ENABLED(CONFIG_RC_CORE) static void dvb_usb_read_remote_control(struct work_struct *work) { struct dvb_usb_device *d = container_of(work, struct dvb_usb_device, rc_query_work.work); int ret; /* * When the parameter has been set to 1 via sysfs while the * driver was running, or when bulk mode is enabled after IR init. */ if (dvb_usbv2_disable_rc_polling || d->rc.bulk_mode) { d->rc_polling_active = false; return; } ret = d->rc.query(d); if (ret < 0) { dev_err(&d->udev->dev, "%s: rc.query() failed=%d\n", KBUILD_MODNAME, ret); d->rc_polling_active = false; return; /* stop polling */ } schedule_delayed_work(&d->rc_query_work, msecs_to_jiffies(d->rc.interval)); } static int dvb_usbv2_remote_init(struct dvb_usb_device *d) { int ret; struct rc_dev *dev; dev_dbg(&d->udev->dev, "%s:\n", __func__); if (dvb_usbv2_disable_rc_polling || !d->props->get_rc_config) return 0; d->rc.map_name = d->rc_map; ret = d->props->get_rc_config(d, &d->rc); if (ret < 0) goto err; /* disable rc when there is no keymap defined */ if (!d->rc.map_name) return 0; dev = rc_allocate_device(d->rc.driver_type); if (!dev) { ret = -ENOMEM; goto err; } dev->dev.parent = &d->udev->dev; dev->device_name = d->name; usb_make_path(d->udev, d->rc_phys, sizeof(d->rc_phys)); strlcat(d->rc_phys, "/ir0", sizeof(d->rc_phys)); dev->input_phys = d->rc_phys; usb_to_input_id(d->udev, &dev->input_id); dev->driver_name = d->props->driver_name; dev->map_name = d->rc.map_name; dev->allowed_protocols = d->rc.allowed_protos; dev->change_protocol = d->rc.change_protocol; dev->timeout = d->rc.timeout; dev->priv = d; ret = rc_register_device(dev); if (ret < 0) { rc_free_device(dev); goto err; } d->rc_dev = dev; /* start polling if needed */ if (d->rc.query && !d->rc.bulk_mode) { /* initialize a work queue for handling polling */ INIT_DELAYED_WORK(&d->rc_query_work, dvb_usb_read_remote_control); dev_info(&d->udev->dev, "%s: schedule remote query interval to %d msecs\n", KBUILD_MODNAME, d->rc.interval); schedule_delayed_work(&d->rc_query_work, msecs_to_jiffies(d->rc.interval)); d->rc_polling_active = true; } return 0; err: dev_dbg(&d->udev->dev, "%s: failed=%d\n", __func__, ret); return ret; } static int dvb_usbv2_remote_exit(struct dvb_usb_device *d) { dev_dbg(&d->udev->dev, "%s:\n", __func__); if (d->rc_dev) { cancel_delayed_work_sync(&d->rc_query_work); rc_unregister_device(d->rc_dev); d->rc_dev = NULL; } return 0; } #else #define dvb_usbv2_remote_init(args...) 0 #define dvb_usbv2_remote_exit(args...) #endif static void dvb_usb_data_complete(struct usb_data_stream *stream, u8 *buf, size_t len) { struct dvb_usb_adapter *adap = stream->user_priv; dvb_dmx_swfilter(&adap->demux, buf, len); } static void dvb_usb_data_complete_204(struct usb_data_stream *stream, u8 *buf, size_t len) { struct dvb_usb_adapter *adap = stream->user_priv; dvb_dmx_swfilter_204(&adap->demux, buf, len); } static void dvb_usb_data_complete_raw(struct usb_data_stream *stream, u8 *buf, size_t len) { struct dvb_usb_adapter *adap = stream->user_priv; dvb_dmx_swfilter_raw(&adap->demux, buf, len); } static int dvb_usbv2_adapter_stream_init(struct dvb_usb_adapter *adap) { dev_dbg(&adap_to_d(adap)->udev->dev, "%s: adap=%d\n", __func__, adap->id); adap->stream.udev = adap_to_d(adap)->udev; adap->stream.user_priv = adap; adap->stream.complete = dvb_usb_data_complete; return usb_urb_initv2(&adap->stream, &adap->props->stream); } static int dvb_usbv2_adapter_stream_exit(struct dvb_usb_adapter *adap) { dev_dbg(&adap_to_d(adap)->udev->dev, "%s: adap=%d\n", __func__, adap->id); return usb_urb_exitv2(&adap->stream); } static int dvb_usb_start_feed(struct dvb_demux_feed *dvbdmxfeed) { struct dvb_usb_adapter *adap = dvbdmxfeed->demux->priv; struct dvb_usb_device *d = adap_to_d(adap); int ret = 0; struct usb_data_stream_properties stream_props; dev_dbg(&d->udev->dev, "%s: adap=%d active_fe=%d feed_type=%d setting pid [%s]: %04x (%04d) at index %d\n", __func__, adap->id, adap->active_fe, dvbdmxfeed->type, adap->pid_filtering ? "yes" : "no", dvbdmxfeed->pid, dvbdmxfeed->pid, dvbdmxfeed->index); /* wait init is done */ wait_on_bit(&adap->state_bits, ADAP_INIT, TASK_UNINTERRUPTIBLE); if (adap->active_fe == -1) return -EINVAL; /* skip feed setup if we are already feeding */ if (adap->feed_count++ > 0) goto skip_feed_start; /* set 'streaming' status bit */ set_bit(ADAP_STREAMING, &adap->state_bits); /* resolve input and output streaming parameters */ if (d->props->get_stream_config) { memcpy(&stream_props, &adap->props->stream, sizeof(struct usb_data_stream_properties)); ret = d->props->get_stream_config(adap->fe[adap->active_fe], &adap->ts_type, &stream_props); if (ret) dev_err(&d->udev->dev, "%s: get_stream_config() failed=%d\n", KBUILD_MODNAME, ret); } else { stream_props = adap->props->stream; } switch (adap->ts_type) { case DVB_USB_FE_TS_TYPE_204: adap->stream.complete = dvb_usb_data_complete_204; break; case DVB_USB_FE_TS_TYPE_RAW: adap->stream.complete = dvb_usb_data_complete_raw; break; case DVB_USB_FE_TS_TYPE_188: default: adap->stream.complete = dvb_usb_data_complete; break; } /* submit USB streaming packets */ usb_urb_submitv2(&adap->stream, &stream_props); /* enable HW PID filter */ if (adap->pid_filtering && adap->props->pid_filter_ctrl) { ret = adap->props->pid_filter_ctrl(adap, 1); if (ret) dev_err(&d->udev->dev, "%s: pid_filter_ctrl() failed=%d\n", KBUILD_MODNAME, ret); } /* ask device to start streaming */ if (d->props->streaming_ctrl) { ret = d->props->streaming_ctrl(adap->fe[adap->active_fe], 1); if (ret) dev_err(&d->udev->dev, "%s: streaming_ctrl() failed=%d\n", KBUILD_MODNAME, ret); } skip_feed_start: /* add PID to device HW PID filter */ if (adap->pid_filtering && adap->props->pid_filter) { ret = adap->props->pid_filter(adap, dvbdmxfeed->index, dvbdmxfeed->pid, 1); if (ret) dev_err(&d->udev->dev, "%s: pid_filter() failed=%d\n", KBUILD_MODNAME, ret); } if (ret) dev_dbg(&d->udev->dev, "%s: failed=%d\n", __func__, ret); return ret; } static int dvb_usb_stop_feed(struct dvb_demux_feed *dvbdmxfeed) { struct dvb_usb_adapter *adap = dvbdmxfeed->demux->priv; struct dvb_usb_device *d = adap_to_d(adap); int ret = 0; dev_dbg(&d->udev->dev, "%s: adap=%d active_fe=%d feed_type=%d setting pid [%s]: %04x (%04d) at index %d\n", __func__, adap->id, adap->active_fe, dvbdmxfeed->type, adap->pid_filtering ? "yes" : "no", dvbdmxfeed->pid, dvbdmxfeed->pid, dvbdmxfeed->index); if (adap->active_fe == -1) return -EINVAL; /* remove PID from device HW PID filter */ if (adap->pid_filtering && adap->props->pid_filter) { ret = adap->props->pid_filter(adap, dvbdmxfeed->index, dvbdmxfeed->pid, 0); if (ret) dev_err(&d->udev->dev, "%s: pid_filter() failed=%d\n", KBUILD_MODNAME, ret); } /* we cannot stop streaming until last PID is removed */ if (--adap->feed_count > 0) goto skip_feed_stop; /* ask device to stop streaming */ if (d->props->streaming_ctrl) { ret = d->props->streaming_ctrl(adap->fe[adap->active_fe], 0); if (ret) dev_err(&d->udev->dev, "%s: streaming_ctrl() failed=%d\n", KBUILD_MODNAME, ret); } /* disable HW PID filter */ if (adap->pid_filtering && adap->props->pid_filter_ctrl) { ret = adap->props->pid_filter_ctrl(adap, 0); if (ret) dev_err(&d->udev->dev, "%s: pid_filter_ctrl() failed=%d\n", KBUILD_MODNAME, ret); } /* kill USB streaming packets */ usb_urb_killv2(&adap->stream); /* clear 'streaming' status bit */ clear_bit(ADAP_STREAMING, &adap->state_bits); smp_mb__after_atomic(); wake_up_bit(&adap->state_bits, ADAP_STREAMING); skip_feed_stop: if (ret) dev_dbg(&d->udev->dev, "%s: failed=%d\n", __func__, ret); return ret; } static int dvb_usbv2_media_device_init(struct dvb_usb_adapter *adap) { #ifdef CONFIG_MEDIA_CONTROLLER_DVB struct media_device *mdev; struct dvb_usb_device *d = adap_to_d(adap); struct usb_device *udev = d->udev; mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return -ENOMEM; media_device_usb_init(mdev, udev, d->name); dvb_register_media_controller(&adap->dvb_adap, mdev); dev_info(&d->udev->dev, "media controller created\n"); #endif return 0; } static int dvb_usbv2_media_device_register(struct dvb_usb_adapter *adap) { #ifdef CONFIG_MEDIA_CONTROLLER_DVB return media_device_register(adap->dvb_adap.mdev); #else return 0; #endif } static void dvb_usbv2_media_device_unregister(struct dvb_usb_adapter *adap) { #ifdef CONFIG_MEDIA_CONTROLLER_DVB if (!adap->dvb_adap.mdev) return; media_device_unregister(adap->dvb_adap.mdev); media_device_cleanup(adap->dvb_adap.mdev); kfree(adap->dvb_adap.mdev); adap->dvb_adap.mdev = NULL; #endif } static int dvb_usbv2_adapter_dvb_init(struct dvb_usb_adapter *adap) { int ret; struct dvb_usb_device *d = adap_to_d(adap); dev_dbg(&d->udev->dev, "%s: adap=%d\n", __func__, adap->id); ret = dvb_register_adapter(&adap->dvb_adap, d->name, d->props->owner, &d->udev->dev, d->props->adapter_nr); if (ret < 0) { dev_dbg(&d->udev->dev, "%s: dvb_register_adapter() failed=%d\n", __func__, ret); goto err_dvb_register_adapter; } adap->dvb_adap.priv = adap; ret = dvb_usbv2_media_device_init(adap); if (ret < 0) { dev_dbg(&d->udev->dev, "%s: dvb_usbv2_media_device_init() failed=%d\n", __func__, ret); goto err_dvb_register_mc; } if (d->props->read_mac_address) { ret = d->props->read_mac_address(adap, adap->dvb_adap.proposed_mac); if (ret < 0) goto err_dvb_dmx_init; dev_info(&d->udev->dev, "%s: MAC address: %pM\n", KBUILD_MODNAME, adap->dvb_adap.proposed_mac); } adap->demux.dmx.capabilities = DMX_TS_FILTERING | DMX_SECTION_FILTERING; adap->demux.priv = adap; adap->demux.filternum = 0; adap->demux.filternum = adap->max_feed_count; adap->demux.feednum = adap->demux.filternum; adap->demux.start_feed = dvb_usb_start_feed; adap->demux.stop_feed = dvb_usb_stop_feed; adap->demux.write_to_decoder = NULL; ret = dvb_dmx_init(&adap->demux); if (ret < 0) { dev_err(&d->udev->dev, "%s: dvb_dmx_init() failed=%d\n", KBUILD_MODNAME, ret); goto err_dvb_dmx_init; } adap->dmxdev.filternum = adap->demux.filternum; adap->dmxdev.demux = &adap->demux.dmx; adap->dmxdev.capabilities = 0; ret = dvb_dmxdev_init(&adap->dmxdev, &adap->dvb_adap); if (ret < 0) { dev_err(&d->udev->dev, "%s: dvb_dmxdev_init() failed=%d\n", KBUILD_MODNAME, ret); goto err_dvb_dmxdev_init; } ret = dvb_net_init(&adap->dvb_adap, &adap->dvb_net, &adap->demux.dmx); if (ret < 0) { dev_err(&d->udev->dev, "%s: dvb_net_init() failed=%d\n", KBUILD_MODNAME, ret); goto err_dvb_net_init; } return 0; err_dvb_net_init: dvb_dmxdev_release(&adap->dmxdev); err_dvb_dmxdev_init: dvb_dmx_release(&adap->demux); err_dvb_dmx_init: dvb_usbv2_media_device_unregister(adap); err_dvb_register_mc: dvb_unregister_adapter(&adap->dvb_adap); err_dvb_register_adapter: adap->dvb_adap.priv = NULL; return ret; } static int dvb_usbv2_adapter_dvb_exit(struct dvb_usb_adapter *adap) { dev_dbg(&adap_to_d(adap)->udev->dev, "%s: adap=%d\n", __func__, adap->id); if (adap->dvb_adap.priv) { dvb_net_release(&adap->dvb_net); adap->demux.dmx.close(&adap->demux.dmx); dvb_dmxdev_release(&adap->dmxdev); dvb_dmx_release(&adap->demux); dvb_unregister_adapter(&adap->dvb_adap); } return 0; } static int dvb_usbv2_device_power_ctrl(struct dvb_usb_device *d, int onoff) { int ret; if (onoff) d->powered++; else d->powered--; if (d->powered == 0 || (onoff && d->powered == 1)) { /* when switching from 1 to 0 or from 0 to 1 */ dev_dbg(&d->udev->dev, "%s: power=%d\n", __func__, onoff); if (d->props->power_ctrl) { ret = d->props->power_ctrl(d, onoff); if (ret < 0) goto err; } } return 0; err: dev_dbg(&d->udev->dev, "%s: failed=%d\n", __func__, ret); return ret; } static int dvb_usb_fe_init(struct dvb_frontend *fe) { int ret; struct dvb_usb_adapter *adap = fe->dvb->priv; struct dvb_usb_device *d = adap_to_d(adap); dev_dbg(&d->udev->dev, "%s: adap=%d fe=%d\n", __func__, adap->id, fe->id); if (!adap->suspend_resume_active) { adap->active_fe = fe->id; set_bit(ADAP_INIT, &adap->state_bits); } ret = dvb_usbv2_device_power_ctrl(d, 1); if (ret < 0) goto err; if (d->props->frontend_ctrl) { ret = d->props->frontend_ctrl(fe, 1); if (ret < 0) goto err; } if (adap->fe_init[fe->id]) { ret = adap->fe_init[fe->id](fe); if (ret < 0) goto err; } err: if (!adap->suspend_resume_active) { clear_bit(ADAP_INIT, &adap->state_bits); smp_mb__after_atomic(); wake_up_bit(&adap->state_bits, ADAP_INIT); } dev_dbg(&d->udev->dev, "%s: ret=%d\n", __func__, ret); return ret; } static int dvb_usb_fe_sleep(struct dvb_frontend *fe) { int ret; struct dvb_usb_adapter *adap = fe->dvb->priv; struct dvb_usb_device *d = adap_to_d(adap); dev_dbg(&d->udev->dev, "%s: adap=%d fe=%d\n", __func__, adap->id, fe->id); if (!adap->suspend_resume_active) { set_bit(ADAP_SLEEP, &adap->state_bits); wait_on_bit(&adap->state_bits, ADAP_STREAMING, TASK_UNINTERRUPTIBLE); } if (adap->fe_sleep[fe->id]) { ret = adap->fe_sleep[fe->id](fe); if (ret < 0) goto err; } if (d->props->frontend_ctrl) { ret = d->props->frontend_ctrl(fe, 0); if (ret < 0) goto err; } ret = dvb_usbv2_device_power_ctrl(d, 0); err: if (!adap->suspend_resume_active) { adap->active_fe = -1; clear_bit(ADAP_SLEEP, &adap->state_bits); smp_mb__after_atomic(); wake_up_bit(&adap->state_bits, ADAP_SLEEP); } dev_dbg(&d->udev->dev, "%s: ret=%d\n", __func__, ret); return ret; } static int dvb_usbv2_adapter_frontend_init(struct dvb_usb_adapter *adap) { int ret, i, count_registered = 0; struct dvb_usb_device *d = adap_to_d(adap); dev_dbg(&d->udev->dev, "%s: adap=%d\n", __func__, adap->id); memset(adap->fe, 0, sizeof(adap->fe)); adap->active_fe = -1; if (d->props->frontend_attach) { ret = d->props->frontend_attach(adap); if (ret < 0) { dev_dbg(&d->udev->dev, "%s: frontend_attach() failed=%d\n", __func__, ret); goto err_dvb_frontend_detach; } } else { dev_dbg(&d->udev->dev, "%s: frontend_attach() do not exists\n", __func__); ret = 0; goto err; } for (i = 0; i < MAX_NO_OF_FE_PER_ADAP && adap->fe[i]; i++) { adap->fe[i]->id = i; /* re-assign sleep and wakeup functions */ adap->fe_init[i] = adap->fe[i]->ops.init; adap->fe[i]->ops.init = dvb_usb_fe_init; adap->fe_sleep[i] = adap->fe[i]->ops.sleep; adap->fe[i]->ops.sleep = dvb_usb_fe_sleep; ret = dvb_register_frontend(&adap->dvb_adap, adap->fe[i]); if (ret < 0) { dev_err(&d->udev->dev, "%s: frontend%d registration failed\n", KBUILD_MODNAME, i); goto err_dvb_unregister_frontend; } count_registered++; } if (d->props->tuner_attach) { ret = d->props->tuner_attach(adap); if (ret < 0) { dev_dbg(&d->udev->dev, "%s: tuner_attach() failed=%d\n", __func__, ret); goto err_dvb_unregister_frontend; } } ret = dvb_create_media_graph(&adap->dvb_adap, true); if (ret < 0) goto err_dvb_unregister_frontend; ret = dvb_usbv2_media_device_register(adap); return ret; err_dvb_unregister_frontend: for (i = count_registered - 1; i >= 0; i--) dvb_unregister_frontend(adap->fe[i]); err_dvb_frontend_detach: for (i = MAX_NO_OF_FE_PER_ADAP - 1; i >= 0; i--) { if (adap->fe[i]) { dvb_frontend_detach(adap->fe[i]); adap->fe[i] = NULL; } } err: dev_dbg(&d->udev->dev, "%s: failed=%d\n", __func__, ret); return ret; } static int dvb_usbv2_adapter_frontend_exit(struct dvb_usb_adapter *adap) { int ret, i; struct dvb_usb_device *d = adap_to_d(adap); dev_dbg(&d->udev->dev, "%s: adap=%d\n", __func__, adap->id); for (i = MAX_NO_OF_FE_PER_ADAP - 1; i >= 0; i--) { if (adap->fe[i]) { dvb_unregister_frontend(adap->fe[i]); dvb_frontend_detach(adap->fe[i]); } } if (d->props->tuner_detach) { ret = d->props->tuner_detach(adap); if (ret < 0) { dev_dbg(&d->udev->dev, "%s: tuner_detach() failed=%d\n", __func__, ret); } } if (d->props->frontend_detach) { ret = d->props->frontend_detach(adap); if (ret < 0) { dev_dbg(&d->udev->dev, "%s: frontend_detach() failed=%d\n", __func__, ret); } } return 0; } static int dvb_usbv2_adapter_init(struct dvb_usb_device *d) { struct dvb_usb_adapter *adap; int ret, i, adapter_count; /* resolve adapter count */ adapter_count = d->props->num_adapters; if (d->props->get_adapter_count) { ret = d->props->get_adapter_count(d); if (ret < 0) goto err; adapter_count = ret; } for (i = 0; i < adapter_count; i++) { adap = &d->adapter[i]; adap->id = i; adap->props = &d->props->adapter[i]; /* speed - when running at FULL speed we need a HW PID filter */ if (d->udev->speed == USB_SPEED_FULL && !(adap->props->caps & DVB_USB_ADAP_HAS_PID_FILTER)) { dev_err(&d->udev->dev, "%s: this USB2.0 device cannot be run on a USB1.1 port (it lacks a hardware PID filter)\n", KBUILD_MODNAME); ret = -ENODEV; goto err; } else if ((d->udev->speed == USB_SPEED_FULL && adap->props->caps & DVB_USB_ADAP_HAS_PID_FILTER) || (adap->props->caps & DVB_USB_ADAP_NEED_PID_FILTERING)) { dev_info(&d->udev->dev, "%s: will use the device's hardware PID filter (table count: %d)\n", KBUILD_MODNAME, adap->props->pid_filter_count); adap->pid_filtering = 1; adap->max_feed_count = adap->props->pid_filter_count; } else { dev_info(&d->udev->dev, "%s: will pass the complete MPEG2 transport stream to the software demuxer\n", KBUILD_MODNAME); adap->pid_filtering = 0; adap->max_feed_count = 255; } if (!adap->pid_filtering && dvb_usb_force_pid_filter_usage && adap->props->caps & DVB_USB_ADAP_HAS_PID_FILTER) { dev_info(&d->udev->dev, "%s: PID filter enabled by module option\n", KBUILD_MODNAME); adap->pid_filtering = 1; adap->max_feed_count = adap->props->pid_filter_count; } ret = dvb_usbv2_adapter_stream_init(adap); if (ret) goto err; ret = dvb_usbv2_adapter_dvb_init(adap); if (ret) goto err; ret = dvb_usbv2_adapter_frontend_init(adap); if (ret) goto err; /* use exclusive FE lock if there is multiple shared FEs */ if (adap->fe[1]) adap->dvb_adap.mfe_shared = 1; } return 0; err: dev_dbg(&d->udev->dev, "%s: failed=%d\n", __func__, ret); return ret; } static int dvb_usbv2_adapter_exit(struct dvb_usb_device *d) { int i; dev_dbg(&d->udev->dev, "%s:\n", __func__); for (i = MAX_NO_OF_ADAPTER_PER_DEVICE - 1; i >= 0; i--) { if (d->adapter[i].props) { dvb_usbv2_adapter_dvb_exit(&d->adapter[i]); dvb_usbv2_adapter_stream_exit(&d->adapter[i]); dvb_usbv2_adapter_frontend_exit(&d->adapter[i]); dvb_usbv2_media_device_unregister(&d->adapter[i]); } } return 0; } /* general initialization functions */ static int dvb_usbv2_exit(struct dvb_usb_device *d) { dev_dbg(&d->udev->dev, "%s:\n", __func__); dvb_usbv2_remote_exit(d); dvb_usbv2_adapter_exit(d); dvb_usbv2_i2c_exit(d); return 0; } static int dvb_usbv2_init(struct dvb_usb_device *d) { int ret; dev_dbg(&d->udev->dev, "%s:\n", __func__); dvb_usbv2_device_power_ctrl(d, 1); if (d->props->read_config) { ret = d->props->read_config(d); if (ret < 0) goto err; } ret = dvb_usbv2_i2c_init(d); if (ret < 0) goto err; ret = dvb_usbv2_adapter_init(d); if (ret < 0) goto err; if (d->props->init) { ret = d->props->init(d); if (ret < 0) goto err; } ret = dvb_usbv2_remote_init(d); if (ret < 0) goto err; dvb_usbv2_device_power_ctrl(d, 0); return 0; err: dvb_usbv2_device_power_ctrl(d, 0); dev_dbg(&d->udev->dev, "%s: failed=%d\n", __func__, ret); return ret; } int dvb_usbv2_probe(struct usb_interface *intf, const struct usb_device_id *id) { int ret; struct dvb_usb_device *d; struct usb_device *udev = interface_to_usbdev(intf); struct dvb_usb_driver_info *driver_info = (struct dvb_usb_driver_info *) id->driver_info; dev_dbg(&udev->dev, "%s: bInterfaceNumber=%d\n", __func__, intf->cur_altsetting->desc.bInterfaceNumber); if (!id->driver_info) { dev_err(&udev->dev, "%s: driver_info failed\n", KBUILD_MODNAME); ret = -ENODEV; goto err; } d = kzalloc(sizeof(struct dvb_usb_device), GFP_KERNEL); if (!d) { dev_err(&udev->dev, "%s: kzalloc() failed\n", KBUILD_MODNAME); ret = -ENOMEM; goto err; } d->intf = intf; d->name = driver_info->name; d->rc_map = driver_info->rc_map; d->udev = udev; d->props = driver_info->props; if (intf->cur_altsetting->desc.bInterfaceNumber != d->props->bInterfaceNumber) { ret = -ENODEV; goto err_kfree_d; } mutex_init(&d->usb_mutex); mutex_init(&d->i2c_mutex); if (d->props->size_of_priv) { d->priv = kzalloc(d->props->size_of_priv, GFP_KERNEL); if (!d->priv) { dev_err(&d->udev->dev, "%s: kzalloc() failed\n", KBUILD_MODNAME); ret = -ENOMEM; goto err_kfree_d; } } if (d->props->probe) { ret = d->props->probe(d); if (ret) goto err_kfree_priv; } if (d->props->identify_state) { const char *name = NULL; ret = d->props->identify_state(d, &name); if (ret == COLD) { dev_info(&d->udev->dev, "%s: found a '%s' in cold state\n", KBUILD_MODNAME, d->name); if (!name) name = d->props->firmware; ret = dvb_usbv2_download_firmware(d, name); if (ret == 0) { /* device is warm, continue initialization */ ; } else if (ret == RECONNECTS_USB) { /* * USB core will call disconnect() and then * probe() as device reconnects itself from the * USB bus. disconnect() will release all driver * resources and probe() is called for 'new' * device. As 'new' device is warm we should * never go here again. */ goto exit; } else { goto err_free_all; } } else if (ret != WARM) { goto err_free_all; } } dev_info(&d->udev->dev, "%s: found a '%s' in warm state\n", KBUILD_MODNAME, d->name); ret = dvb_usbv2_init(d); if (ret < 0) goto err_free_all; dev_info(&d->udev->dev, "%s: '%s' successfully initialized and connected\n", KBUILD_MODNAME, d->name); exit: usb_set_intfdata(intf, d); return 0; err_free_all: dvb_usbv2_exit(d); if (d->props->disconnect) d->props->disconnect(d); err_kfree_priv: kfree(d->priv); err_kfree_d: kfree(d); err: dev_dbg(&udev->dev, "%s: failed=%d\n", __func__, ret); return ret; } EXPORT_SYMBOL(dvb_usbv2_probe); void dvb_usbv2_disconnect(struct usb_interface *intf) { struct dvb_usb_device *d = usb_get_intfdata(intf); const char *devname = kstrdup(dev_name(&d->udev->dev), GFP_KERNEL); const char *drvname = d->name; dev_dbg(&d->udev->dev, "%s: bInterfaceNumber=%d\n", __func__, intf->cur_altsetting->desc.bInterfaceNumber); if (d->props->exit) d->props->exit(d); dvb_usbv2_exit(d); if (d->props->disconnect) d->props->disconnect(d); kfree(d->priv); kfree(d); pr_info("%s: '%s:%s' successfully deinitialized and disconnected\n", KBUILD_MODNAME, drvname, devname); kfree(devname); } EXPORT_SYMBOL(dvb_usbv2_disconnect); int dvb_usbv2_suspend(struct usb_interface *intf, pm_message_t msg) { struct dvb_usb_device *d = usb_get_intfdata(intf); int ret = 0, i, active_fe; struct dvb_frontend *fe; dev_dbg(&d->udev->dev, "%s:\n", __func__); /* stop remote controller poll */ if (d->rc_polling_active) cancel_delayed_work_sync(&d->rc_query_work); for (i = MAX_NO_OF_ADAPTER_PER_DEVICE - 1; i >= 0; i--) { active_fe = d->adapter[i].active_fe; if (d->adapter[i].dvb_adap.priv && active_fe != -1) { fe = d->adapter[i].fe[active_fe]; d->adapter[i].suspend_resume_active = true; if (d->props->streaming_ctrl) d->props->streaming_ctrl(fe, 0); /* stop usb streaming */ usb_urb_killv2(&d->adapter[i].stream); ret = dvb_frontend_suspend(fe); } } return ret; } EXPORT_SYMBOL(dvb_usbv2_suspend); static int dvb_usbv2_resume_common(struct dvb_usb_device *d) { int ret = 0, i, active_fe; struct dvb_frontend *fe; dev_dbg(&d->udev->dev, "%s:\n", __func__); for (i = 0; i < MAX_NO_OF_ADAPTER_PER_DEVICE; i++) { active_fe = d->adapter[i].active_fe; if (d->adapter[i].dvb_adap.priv && active_fe != -1) { fe = d->adapter[i].fe[active_fe]; ret = dvb_frontend_resume(fe); /* resume usb streaming */ usb_urb_submitv2(&d->adapter[i].stream, NULL); if (d->props->streaming_ctrl) d->props->streaming_ctrl(fe, 1); d->adapter[i].suspend_resume_active = false; } } /* start remote controller poll */ if (d->rc_polling_active) schedule_delayed_work(&d->rc_query_work, msecs_to_jiffies(d->rc.interval)); return ret; } int dvb_usbv2_resume(struct usb_interface *intf) { struct dvb_usb_device *d = usb_get_intfdata(intf); dev_dbg(&d->udev->dev, "%s:\n", __func__); return dvb_usbv2_resume_common(d); } EXPORT_SYMBOL(dvb_usbv2_resume); int dvb_usbv2_reset_resume(struct usb_interface *intf) { struct dvb_usb_device *d = usb_get_intfdata(intf); int ret; dev_dbg(&d->udev->dev, "%s:\n", __func__); dvb_usbv2_device_power_ctrl(d, 1); if (d->props->init) d->props->init(d); ret = dvb_usbv2_resume_common(d); dvb_usbv2_device_power_ctrl(d, 0); return ret; } EXPORT_SYMBOL(dvb_usbv2_reset_resume); MODULE_VERSION("2.0"); MODULE_AUTHOR("Patrick Boettcher <patrick.boettcher@posteo.de>"); MODULE_AUTHOR("Antti Palosaari <crope@iki.fi>"); MODULE_DESCRIPTION("DVB USB common"); MODULE_LICENSE("GPL"); |
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 | // SPDX-License-Identifier: GPL-2.0-or-later /* md.c : Multiple Devices driver for Linux Copyright (C) 1998, 1999, 2000 Ingo Molnar completely rewritten, based on the MD driver code from Marc Zyngier Changes: - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> - kerneld support by Boris Tobotras <boris@xtalk.msk.su> - kmod support by: Cyrus Durgin - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> - lots of fixes and improvements to the RAID1/RAID5 and generic RAID code (such as request based resynchronization): Neil Brown <neilb@cse.unsw.edu.au>. - persistent bitmap code Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. Errors, Warnings, etc. Please use: pr_crit() for error conditions that risk data loss pr_err() for error conditions that are unexpected, like an IO error or internal inconsistency pr_warn() for error conditions that could have been predicated, like adding a device to an array when it has incompatible metadata pr_info() for every interesting, very rare events, like an array starting or stopping, or resync starting or stopping pr_debug() for everything else. */ #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/kthread.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/badblocks.h> #include <linux/sysctl.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/poll.h> #include <linux/ctype.h> #include <linux/string.h> #include <linux/hdreg.h> #include <linux/proc_fs.h> #include <linux/random.h> #include <linux/major.h> #include <linux/module.h> #include <linux/reboot.h> #include <linux/file.h> #include <linux/compat.h> #include <linux/delay.h> #include <linux/raid/md_p.h> #include <linux/raid/md_u.h> #include <linux/raid/detect.h> #include <linux/slab.h> #include <linux/percpu-refcount.h> #include <linux/part_stat.h> #include "md.h" #include "md-bitmap.h" #include "md-cluster.h" /* pers_list is a list of registered personalities protected by pers_lock. */ static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); static const struct kobj_type md_ktype; struct md_cluster_operations *md_cluster_ops; EXPORT_SYMBOL(md_cluster_ops); static struct module *md_cluster_mod; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; /* * This workqueue is used for sync_work to register new sync_thread, and for * del_work to remove rdev, and for event_work that is only set by dm-raid. * * Noted that sync_work will grab reconfig_mutex, hence never flush this * workqueue whith reconfig_mutex grabbed. */ static struct workqueue_struct *md_misc_wq; struct workqueue_struct *md_bitmap_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); static void md_wakeup_thread_directly(struct md_thread __rcu *thread); /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array. We divide the read error * count by 2 for every hour elapsed between read errors. */ #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 /* Default safemode delay: 200 msec */ #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) /* * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' * is 1000 KB/sec, so the extra system load does not show up that much. * Increase it if you want to have more _guaranteed_ speed. Note that * the RAID driver will use the maximum available bandwidth if the IO * subsystem is idle. There is also an 'absolute maximum' reconstruction * speed limit - in case reconstruction slows down your system despite * idle IO detection. * * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. * or /sys/block/mdX/md/sync_speed_{min,max} */ static int sysctl_speed_limit_min = 1000; static int sysctl_speed_limit_max = 200000; static inline int speed_min(struct mddev *mddev) { return mddev->sync_speed_min ? mddev->sync_speed_min : sysctl_speed_limit_min; } static inline int speed_max(struct mddev *mddev) { return mddev->sync_speed_max ? mddev->sync_speed_max : sysctl_speed_limit_max; } static void rdev_uninit_serial(struct md_rdev *rdev) { if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) return; kvfree(rdev->serial); rdev->serial = NULL; } static void rdevs_uninit_serial(struct mddev *mddev) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) rdev_uninit_serial(rdev); } static int rdev_init_serial(struct md_rdev *rdev) { /* serial_nums equals with BARRIER_BUCKETS_NR */ int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); struct serial_in_rdev *serial = NULL; if (test_bit(CollisionCheck, &rdev->flags)) return 0; serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, GFP_KERNEL); if (!serial) return -ENOMEM; for (i = 0; i < serial_nums; i++) { struct serial_in_rdev *serial_tmp = &serial[i]; spin_lock_init(&serial_tmp->serial_lock); serial_tmp->serial_rb = RB_ROOT_CACHED; init_waitqueue_head(&serial_tmp->serial_io_wait); } rdev->serial = serial; set_bit(CollisionCheck, &rdev->flags); return 0; } static int rdevs_init_serial(struct mddev *mddev) { struct md_rdev *rdev; int ret = 0; rdev_for_each(rdev, mddev) { ret = rdev_init_serial(rdev); if (ret) break; } /* Free all resources if pool is not existed */ if (ret && !mddev->serial_info_pool) rdevs_uninit_serial(mddev); return ret; } /* * rdev needs to enable serial stuffs if it meets the conditions: * 1. it is multi-queue device flaged with writemostly. * 2. the write-behind mode is enabled. */ static int rdev_need_serial(struct md_rdev *rdev) { return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && test_bit(WriteMostly, &rdev->flags)); } /* * Init resource for rdev(s), then create serial_info_pool if: * 1. rdev is the first device which return true from rdev_enable_serial. * 2. rdev is NULL, means we want to enable serialization for all rdevs. */ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { int ret = 0; if (rdev && !rdev_need_serial(rdev) && !test_bit(CollisionCheck, &rdev->flags)) return; if (!rdev) ret = rdevs_init_serial(mddev); else ret = rdev_init_serial(rdev); if (ret) return; if (mddev->serial_info_pool == NULL) { /* * already in memalloc noio context by * mddev_suspend() */ mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, sizeof(struct serial_info)); if (!mddev->serial_info_pool) { rdevs_uninit_serial(mddev); pr_err("can't alloc memory pool for serialization\n"); } } } /* * Free resource from rdev(s), and destroy serial_info_pool under conditions: * 1. rdev is the last device flaged with CollisionCheck. * 2. when bitmap is destroyed while policy is not enabled. * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; if (mddev->serial_info_pool) { struct md_rdev *temp; int num = 0; /* used to track if other rdevs need the pool */ rdev_for_each(temp, mddev) { if (!rdev) { if (!mddev->serialize_policy || !rdev_need_serial(temp)) rdev_uninit_serial(temp); else num++; } else if (temp != rdev && test_bit(CollisionCheck, &temp->flags)) num++; } if (rdev) rdev_uninit_serial(rdev); if (num) pr_info("The mempool could be used by other devices\n"); else { mempool_destroy(mddev->serial_info_pool); mddev->serial_info_pool = NULL; } } } static struct ctl_table_header *raid_table_header; static struct ctl_table raid_table[] = { { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, .maxlen = sizeof(int), .mode = S_IRUGO|S_IWUSR, .proc_handler = proc_dointvec, }, { .procname = "speed_limit_max", .data = &sysctl_speed_limit_max, .maxlen = sizeof(int), .mode = S_IRUGO|S_IWUSR, .proc_handler = proc_dointvec, }, }; static int start_readonly; /* * The original mechanism for creating an md device is to create * a device node in /dev and to open it. This causes races with device-close. * The preferred method is to write to the "new_array" module parameter. * This can avoid races. * Setting create_on_open to false disables the original mechanism * so all the races disappear. */ static bool create_on_open = true; /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat * can use 'poll' or 'select' to find out when the event * count increases. * * Events are: * start array, stop array, error, add device, remove device, * start build, activate spare */ static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); static atomic_t md_event_count; void md_new_event(void) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); } EXPORT_SYMBOL_GPL(md_new_event); /* * Enables to iterate over all existing md arrays * all_mddevs_lock protects this list. */ static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); static bool is_md_suspended(struct mddev *mddev) { return percpu_ref_is_dying(&mddev->active_io); } /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. * We hold a refcount over the call to ->make_request. By the time that * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ static bool is_suspended(struct mddev *mddev, struct bio *bio) { if (is_md_suspended(mddev)) return true; if (bio_data_dir(bio) != WRITE) return false; if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) return false; if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) return false; if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) return false; return true; } bool md_handle_request(struct mddev *mddev, struct bio *bio) { check_suspended: if (is_suspended(mddev, bio)) { DEFINE_WAIT(__wait); /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) { bio_wouldblock_error(bio); return true; } for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); if (!is_suspended(mddev, bio)) break; schedule(); } finish_wait(&mddev->sb_wait, &__wait); } if (!percpu_ref_tryget_live(&mddev->active_io)) goto check_suspended; if (!mddev->pers->make_request(mddev, bio)) { percpu_ref_put(&mddev->active_io); if (!mddev->gendisk && mddev->pers->prepare_suspend) return false; goto check_suspended; } percpu_ref_put(&mddev->active_io); return true; } EXPORT_SYMBOL(md_handle_request); static void md_submit_bio(struct bio *bio) { const int rw = bio_data_dir(bio); struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); return; } if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { bio_io_error(bio); return; } bio = bio_split_to_limits(bio); if (!bio) return; if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; } /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; md_handle_request(mddev, bio); } /* * Make sure no new requests are submitted to the device, and any requests that * have been submitted are completely handled. */ int mddev_suspend(struct mddev *mddev, bool interruptible) { int err = 0; /* * hold reconfig_mutex to wait for normal io will deadlock, because * other context can't update super_block, and normal io can rely on * updating super_block. */ lockdep_assert_not_held(&mddev->reconfig_mutex); if (interruptible) err = mutex_lock_interruptible(&mddev->suspend_mutex); else mutex_lock(&mddev->suspend_mutex); if (err) return err; if (mddev->suspended) { WRITE_ONCE(mddev->suspended, mddev->suspended + 1); mutex_unlock(&mddev->suspend_mutex); return 0; } percpu_ref_kill(&mddev->active_io); if (interruptible) err = wait_event_interruptible(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); else wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); if (err) { percpu_ref_resurrect(&mddev->active_io); mutex_unlock(&mddev->suspend_mutex); return err; } /* * For raid456, io might be waiting for reshape to make progress, * allow new reshape to start while waiting for io to be done to * prevent deadlock. */ WRITE_ONCE(mddev->suspended, mddev->suspended + 1); del_timer_sync(&mddev->safemode_timer); /* restrict memory reclaim I/O during raid array is suspend */ mddev->noio_flag = memalloc_noio_save(); mutex_unlock(&mddev->suspend_mutex); return 0; } EXPORT_SYMBOL_GPL(mddev_suspend); static void __mddev_resume(struct mddev *mddev, bool recovery_needed) { lockdep_assert_not_held(&mddev->reconfig_mutex); mutex_lock(&mddev->suspend_mutex); WRITE_ONCE(mddev->suspended, mddev->suspended - 1); if (mddev->suspended) { mutex_unlock(&mddev->suspend_mutex); return; } /* entred the memalloc scope from mddev_suspend() */ memalloc_noio_restore(mddev->noio_flag); percpu_ref_resurrect(&mddev->active_io); wake_up(&mddev->sb_wait); if (recovery_needed) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ mutex_unlock(&mddev->suspend_mutex); } void mddev_resume(struct mddev *mddev) { return __mddev_resume(mddev, true); } EXPORT_SYMBOL_GPL(mddev_resume); /* sync bdev before setting device to readonly or stopping raid*/ static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num) { mutex_lock(&mddev->open_mutex); if (mddev->pers && atomic_read(&mddev->openers) > opener_num) { mutex_unlock(&mddev->open_mutex); return -EBUSY; } if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { mutex_unlock(&mddev->open_mutex); return -EBUSY; } mutex_unlock(&mddev->open_mutex); sync_blockdev(mddev->gendisk->part0); return 0; } /* * Generic flush handling for md */ static void md_end_flush(struct bio *bio) { struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; bio_put(bio); rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&mddev->flush_pending)) { /* The pair is percpu_ref_get() from md_flush_request() */ percpu_ref_put(&mddev->active_io); /* The pre-request flush has finished */ queue_work(md_wq, &mddev->flush_work); } } static void md_submit_flush_data(struct work_struct *ws); static void submit_flushes(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct md_rdev *rdev; mddev->start_flush = ktime_get_boottime(); INIT_WORK(&mddev->flush_work, md_submit_flush_data); atomic_set(&mddev->flush_pending, 1); rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { struct bio *bi; atomic_inc(&rdev->nr_pending); rcu_read_unlock(); bi = bio_alloc_bioset(rdev->bdev, 0, REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, &mddev->bio_set); bi->bi_end_io = md_end_flush; bi->bi_private = rdev; atomic_inc(&mddev->flush_pending); submit_bio(bi); rcu_read_lock(); } rcu_read_unlock(); if (atomic_dec_and_test(&mddev->flush_pending)) { /* The pair is percpu_ref_get() from md_flush_request() */ percpu_ref_put(&mddev->active_io); queue_work(md_wq, &mddev->flush_work); } } static void md_submit_flush_data(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct bio *bio = mddev->flush_bio; /* * must reset flush_bio before calling into md_handle_request to avoid a * deadlock, because other bios passed md_handle_request suspend check * could wait for this and below md_handle_request could wait for those * bios because of suspend check */ spin_lock_irq(&mddev->lock); mddev->prev_flush_start = mddev->start_flush; mddev->flush_bio = NULL; spin_unlock_irq(&mddev->lock); wake_up(&mddev->sb_wait); if (bio->bi_iter.bi_size == 0) { /* an empty barrier - all done */ bio_endio(bio); } else { bio->bi_opf &= ~REQ_PREFLUSH; md_handle_request(mddev, bio); } } /* * Manages consolidation of flushes and submitting any flushes needed for * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is * being finished in another context. Returns false if the flushing is * complete but still needs the I/O portion of the bio to be processed. */ bool md_flush_request(struct mddev *mddev, struct bio *bio) { ktime_t req_start = ktime_get_boottime(); spin_lock_irq(&mddev->lock); /* flush requests wait until ongoing flush completes, * hence coalescing all the pending requests. */ wait_event_lock_irq(mddev->sb_wait, !mddev->flush_bio || ktime_before(req_start, mddev->prev_flush_start), mddev->lock); /* new request after previous flush is completed */ if (ktime_after(req_start, mddev->prev_flush_start)) { WARN_ON(mddev->flush_bio); /* * Grab a reference to make sure mddev_suspend() will wait for * this flush to be done. * * md_flush_reqeust() is called under md_handle_request() and * 'active_io' is already grabbed, hence percpu_ref_is_zero() * won't pass, percpu_ref_tryget_live() can't be used because * percpu_ref_kill() can be called by mddev_suspend() * concurrently. */ WARN_ON(percpu_ref_is_zero(&mddev->active_io)); percpu_ref_get(&mddev->active_io); mddev->flush_bio = bio; bio = NULL; } spin_unlock_irq(&mddev->lock); if (!bio) { INIT_WORK(&mddev->flush_work, submit_flushes); queue_work(md_wq, &mddev->flush_work); } else { /* flush was performed for some other bio while we waited. */ if (bio->bi_iter.bi_size == 0) /* an empty barrier - all done */ bio_endio(bio); else { bio->bi_opf &= ~REQ_PREFLUSH; return false; } } return true; } EXPORT_SYMBOL(md_flush_request); static inline struct mddev *mddev_get(struct mddev *mddev) { lockdep_assert_held(&all_mddevs_lock); if (test_bit(MD_DELETED, &mddev->flags)) return NULL; atomic_inc(&mddev->active); return mddev; } static void mddev_delayed_delete(struct work_struct *ws); static void __mddev_put(struct mddev *mddev) { if (mddev->raid_disks || !list_empty(&mddev->disks) || mddev->ctime || mddev->hold_active) return; /* Array is not configured at all, and not held active, so destroy it */ set_bit(MD_DELETED, &mddev->flags); /* * Call queue_work inside the spinlock so that flush_workqueue() after * mddev_find will succeed in waiting for the work to be done. */ queue_work(md_misc_wq, &mddev->del_work); } void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; __mddev_put(mddev); spin_unlock(&all_mddevs_lock); } static void md_safemode_timeout(struct timer_list *t); static void md_start_sync(struct work_struct *ws); static void active_io_release(struct percpu_ref *ref) { struct mddev *mddev = container_of(ref, struct mddev, active_io); wake_up(&mddev->sb_wait); } static void no_op(struct percpu_ref *r) {} int mddev_init(struct mddev *mddev) { if (percpu_ref_init(&mddev->active_io, active_io_release, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) return -ENOMEM; if (percpu_ref_init(&mddev->writes_pending, no_op, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { percpu_ref_exit(&mddev->active_io); return -ENOMEM; } /* We want to start with the refcount at zero */ percpu_ref_put(&mddev->writes_pending); mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->sync_mutex); mutex_init(&mddev->suspend_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); INIT_LIST_HEAD(&mddev->deleting); timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->sync_seq, 0); spin_lock_init(&mddev->lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; mddev->reshape_backwards = 0; mddev->last_sync_action = "none"; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; INIT_WORK(&mddev->sync_work, md_start_sync); INIT_WORK(&mddev->del_work, mddev_delayed_delete); return 0; } EXPORT_SYMBOL_GPL(mddev_init); void mddev_destroy(struct mddev *mddev) { percpu_ref_exit(&mddev->active_io); percpu_ref_exit(&mddev->writes_pending); } EXPORT_SYMBOL_GPL(mddev_destroy); static struct mddev *mddev_find_locked(dev_t unit) { struct mddev *mddev; list_for_each_entry(mddev, &all_mddevs, all_mddevs) if (mddev->unit == unit) return mddev; return NULL; } /* find an unused unit number */ static dev_t mddev_alloc_unit(void) { static int next_minor = 512; int start = next_minor; bool is_free = 0; dev_t dev = 0; while (!is_free) { dev = MKDEV(MD_MAJOR, next_minor); next_minor++; if (next_minor > MINORMASK) next_minor = 0; if (next_minor == start) return 0; /* Oh dear, all in use. */ is_free = !mddev_find_locked(dev); } return dev; } static struct mddev *mddev_alloc(dev_t unit) { struct mddev *new; int error; if (unit && MAJOR(unit) != MD_MAJOR) unit &= ~((1 << MdpMinorShift) - 1); new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); error = mddev_init(new); if (error) goto out_free_new; spin_lock(&all_mddevs_lock); if (unit) { error = -EEXIST; if (mddev_find_locked(unit)) goto out_destroy_new; new->unit = unit; if (MAJOR(unit) == MD_MAJOR) new->md_minor = MINOR(unit); else new->md_minor = MINOR(unit) >> MdpMinorShift; new->hold_active = UNTIL_IOCTL; } else { error = -ENODEV; new->unit = mddev_alloc_unit(); if (!new->unit) goto out_destroy_new; new->md_minor = MINOR(new->unit); new->hold_active = UNTIL_STOP; } list_add(&new->all_mddevs, &all_mddevs); spin_unlock(&all_mddevs_lock); return new; out_destroy_new: spin_unlock(&all_mddevs_lock); mddev_destroy(new); out_free_new: kfree(new); return ERR_PTR(error); } static void mddev_free(struct mddev *mddev) { spin_lock(&all_mddevs_lock); list_del(&mddev->all_mddevs); spin_unlock(&all_mddevs_lock); mddev_destroy(mddev); kfree(mddev); } static const struct attribute_group md_redundancy_group; void mddev_unlock(struct mddev *mddev) { struct md_rdev *rdev; struct md_rdev *tmp; LIST_HEAD(delete); if (!list_empty(&mddev->deleting)) list_splice_init(&mddev->deleting, &delete); if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as * an access to the files will try to take reconfig_mutex * while holding the file unremovable, which leads to * a deadlock. * So hold set sysfs_active while the remove in happeing, * and anything else which might set ->to_remove or my * otherwise change the sysfs namespace will fail with * -EBUSY if sysfs_active is still set. * We set sysfs_active under reconfig_mutex and elsewhere * test it under the same mutex to ensure its correct value * is seen. */ const struct attribute_group *to_remove = mddev->to_remove; mddev->to_remove = NULL; mddev->sysfs_active = 1; mutex_unlock(&mddev->reconfig_mutex); if (mddev->kobj.sd) { if (to_remove != &md_redundancy_group) sysfs_remove_group(&mddev->kobj, to_remove); if (mddev->pers == NULL || mddev->pers->sync_request == NULL) { sysfs_remove_group(&mddev->kobj, &md_redundancy_group); if (mddev->sysfs_action) sysfs_put(mddev->sysfs_action); if (mddev->sysfs_completed) sysfs_put(mddev->sysfs_completed); if (mddev->sysfs_degraded) sysfs_put(mddev->sysfs_degraded); mddev->sysfs_action = NULL; mddev->sysfs_completed = NULL; mddev->sysfs_degraded = NULL; } } mddev->sysfs_active = 0; } else mutex_unlock(&mddev->reconfig_mutex); md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); list_for_each_entry_safe(rdev, tmp, &delete, same_set) { list_del_init(&rdev->same_set); kobject_del(&rdev->kobj); export_rdev(rdev, mddev); } } EXPORT_SYMBOL_GPL(mddev_unlock); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) { struct md_rdev *rdev; rdev_for_each_rcu(rdev, mddev) if (rdev->desc_nr == nr) return rdev; return NULL; } EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) if (rdev->bdev->bd_dev == dev) return rdev; return NULL; } struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; rdev_for_each_rcu(rdev, mddev) if (rdev->bdev->bd_dev == dev) return rdev; return NULL; } EXPORT_SYMBOL_GPL(md_find_rdev_rcu); static struct md_personality *find_pers(int level, char *clevel) { struct md_personality *pers; list_for_each_entry(pers, &pers_list, list) { if (level != LEVEL_NONE && pers->level == level) return pers; if (strcmp(pers->name, clevel)==0) return pers; } return NULL; } /* return the offset of the super block in 512byte sectors */ static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) { return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); } static int alloc_disk_sb(struct md_rdev *rdev) { rdev->sb_page = alloc_page(GFP_KERNEL); if (!rdev->sb_page) return -ENOMEM; return 0; } void md_rdev_clear(struct md_rdev *rdev) { if (rdev->sb_page) { put_page(rdev->sb_page); rdev->sb_loaded = 0; rdev->sb_page = NULL; rdev->sb_start = 0; rdev->sectors = 0; } if (rdev->bb_page) { put_page(rdev->bb_page); rdev->bb_page = NULL; } badblocks_exit(&rdev->badblocks); } EXPORT_SYMBOL_GPL(md_rdev_clear); static void super_written(struct bio *bio) { struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; if (bio->bi_status) { pr_err("md: %s gets error=%d\n", __func__, blk_status_to_errno(bio->bi_status)); md_error(mddev, rdev); if (!test_bit(Faulty, &rdev->flags) && (bio->bi_opf & MD_FAILFAST)) { set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); set_bit(LastDev, &rdev->flags); } } else clear_bit(LastDev, &rdev->flags); bio_put(bio); rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&mddev->pending_writes)) wake_up(&mddev->sb_wait); } void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page) { /* write first size bytes of page to sector of rdev * Increment mddev->pending_writes before returning * and decrement it on completion, waking up sb_wait * if zero is reached. * If an error occurred, call md_error */ struct bio *bio; if (!page) return; if (test_bit(Faulty, &rdev->flags)) return; bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META | REQ_PREFLUSH | REQ_FUA, GFP_NOIO, &mddev->sync_set); atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector; __bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && test_bit(FailFast, &rdev->flags) && !test_bit(LastDev, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; atomic_inc(&mddev->pending_writes); submit_bio(bio); } int md_super_wait(struct mddev *mddev) { /* wait for all superblock writes that were scheduled to complete */ wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) return -EAGAIN; return 0; } int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, blk_opf_t opf, bool metadata_op) { struct bio bio; struct bio_vec bvec; if (metadata_op && rdev->meta_bdev) bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); else bio_init(&bio, rdev->bdev, &bvec, 1, opf); if (metadata_op) bio.bi_iter.bi_sector = sector + rdev->sb_start; else if (rdev->mddev->reshape_position != MaxSector && (rdev->mddev->reshape_backwards == (sector >= rdev->mddev->reshape_position))) bio.bi_iter.bi_sector = sector + rdev->new_data_offset; else bio.bi_iter.bi_sector = sector + rdev->data_offset; __bio_add_page(&bio, page, size, 0); submit_bio_wait(&bio); return !bio.bi_status; } EXPORT_SYMBOL_GPL(sync_page_io); static int read_disk_sb(struct md_rdev *rdev, int size) { if (rdev->sb_loaded) return 0; if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) goto fail; rdev->sb_loaded = 1; return 0; fail: pr_err("md: disabled device %pg, could not read superblock.\n", rdev->bdev); return -EINVAL; } static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) { return sb1->set_uuid0 == sb2->set_uuid0 && sb1->set_uuid1 == sb2->set_uuid1 && sb1->set_uuid2 == sb2->set_uuid2 && sb1->set_uuid3 == sb2->set_uuid3; } static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) { int ret; mdp_super_t *tmp1, *tmp2; tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); if (!tmp1 || !tmp2) { ret = 0; goto abort; } *tmp1 = *sb1; *tmp2 = *sb2; /* * nr_disks is not constant */ tmp1->nr_disks = 0; tmp2->nr_disks = 0; ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); abort: kfree(tmp1); kfree(tmp2); return ret; } static u32 md_csum_fold(u32 csum) { csum = (csum & 0xffff) + (csum >> 16); return (csum & 0xffff) + (csum >> 16); } static unsigned int calc_sb_csum(mdp_super_t *sb) { u64 newcsum = 0; u32 *sb32 = (u32*)sb; int i; unsigned int disk_csum, csum; disk_csum = sb->sb_csum; sb->sb_csum = 0; for (i = 0; i < MD_SB_BYTES/4 ; i++) newcsum += sb32[i]; csum = (newcsum & 0xffffffff) + (newcsum>>32); #ifdef CONFIG_ALPHA /* This used to use csum_partial, which was wrong for several * reasons including that different results are returned on * different architectures. It isn't critical that we get exactly * the same return value as before (we always csum_fold before * testing, and that removes any differences). However as we * know that csum_partial always returned a 16bit value on * alphas, do a fold to maximise conformity to previous behaviour. */ sb->sb_csum = md_csum_fold(disk_csum); #else sb->sb_csum = disk_csum; #endif return csum; } /* * Handle superblock details. * We want to be able to handle multiple superblock formats * so we have a common interface to them all, and an array of * different handlers. * We rely on user-space to write the initial superblock, and support * reading and updating of superblocks. * Interface methods are: * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) * loads and validates a superblock on dev. * if refdev != NULL, compare superblocks on both devices * Return: * 0 - dev has a superblock that is compatible with refdev * 1 - dev has a superblock that is compatible and newer than refdev * so dev should be used as the refdev in future * -EINVAL superblock incompatible or invalid * -othererror e.g. -EIO * * int validate_super(struct mddev *mddev, struct md_rdev *dev) * Verify that dev is acceptable into mddev. * The first time, mddev->raid_disks will be 0, and data from * dev should be merged in. Subsequent calls check that dev * is new enough. Return 0 or -EINVAL * * void sync_super(struct mddev *mddev, struct md_rdev *dev) * Update the superblock for rdev with data in mddev * This does not write to disc. * */ struct super_type { char *name; struct module *owner; int (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version); int (*validate_super)(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev); void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); unsigned long long (*rdev_size_change)(struct md_rdev *rdev, sector_t num_sectors); int (*allow_new_offset)(struct md_rdev *rdev, unsigned long long new_offset); }; /* * Check that the given mddev has no bitmap. * * This function is called from the run method of all personalities that do not * support bitmaps. It prints an error message and returns non-zero if mddev * has a bitmap. Otherwise, it returns 0. * */ int md_check_no_bitmap(struct mddev *mddev) { if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) return 0; pr_warn("%s: bitmaps are not supported for %s\n", mdname(mddev), mddev->pers->name); return 1; } EXPORT_SYMBOL(md_check_no_bitmap); /* * load_super for 0.90.0 */ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { mdp_super_t *sb; int ret; bool spare_disk = true; /* * Calculate the position of the superblock (512byte sectors), * it's at the end of the disk. * * It also happens to be a multiple of 4Kb. */ rdev->sb_start = calc_dev_sboffset(rdev); ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; ret = -EINVAL; sb = page_address(rdev->sb_page); if (sb->md_magic != MD_SB_MAGIC) { pr_warn("md: invalid raid superblock magic on %pg\n", rdev->bdev); goto abort; } if (sb->major_version != 0 || sb->minor_version < 90 || sb->minor_version > 91) { pr_warn("Bad version number %d.%d on %pg\n", sb->major_version, sb->minor_version, rdev->bdev); goto abort; } if (sb->raid_disks <= 0) goto abort; if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); goto abort; } rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; rdev->new_data_offset = 0; rdev->sb_size = MD_SB_BYTES; rdev->badblocks.shift = -1; rdev->desc_nr = sb->this_disk.number; /* not spare disk */ if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) spare_disk = false; if (!refdev) { if (!spare_disk) ret = 1; else ret = 0; } else { __u64 ev1, ev2; mdp_super_t *refsb = page_address(refdev->sb_page); if (!md_uuid_equal(refsb, sb)) { pr_warn("md: %pg has different UUID to %pg\n", rdev->bdev, refdev->bdev); goto abort; } if (!md_sb_equal(refsb, sb)) { pr_warn("md: %pg has same UUID but different superblock to %pg\n", rdev->bdev, refdev->bdev); goto abort; } ev1 = md_event(sb); ev2 = md_event(refsb); if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; } rdev->sectors = rdev->sb_start; /* Limit to 4TB as metadata cannot record more than that. * (not needed for Linear and RAID0 as metadata doesn't * record this size) */ if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) rdev->sectors = (sector_t)(2ULL << 32) - 2; if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) /* "this cannot possibly happen" ... */ ret = -EINVAL; abort: return ret; } /* * validate_super for 0.90.0 * note: we are not using "freshest" for 0.9 superblock */ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) { mdp_disk_t *desc; mdp_super_t *sb = page_address(rdev->sb_page); __u64 ev1 = md_event(sb); rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 0; mddev->minor_version = sb->minor_version; mddev->patch_version = sb->patch_version; mddev->external = 0; mddev->chunk_sectors = sb->chunk_size >> 9; mddev->ctime = sb->ctime; mddev->utime = sb->utime; mddev->level = sb->level; mddev->clevel[0] = 0; mddev->layout = sb->layout; mddev->raid_disks = sb->raid_disks; mddev->dev_sectors = ((sector_t)sb->size) * 2; mddev->events = ev1; mddev->bitmap_info.offset = 0; mddev->bitmap_info.space = 0; /* bitmap can use 60 K after the 4K superblocks */ mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); mddev->reshape_backwards = 0; if (mddev->minor_version >= 91) { mddev->reshape_position = sb->reshape_position; mddev->delta_disks = sb->delta_disks; mddev->new_level = sb->new_level; mddev->new_layout = sb->new_layout; mddev->new_chunk_sectors = sb->new_chunk >> 9; if (mddev->delta_disks < 0) mddev->reshape_backwards = 1; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; } if (mddev->level == 0) mddev->layout = -1; if (sb->state & (1<<MD_SB_CLEAN)) mddev->recovery_cp = MaxSector; else { if (sb->events_hi == sb->cp_events_hi && sb->events_lo == sb->cp_events_lo) { mddev->recovery_cp = sb->recovery_cp; } else mddev->recovery_cp = 0; } memcpy(mddev->uuid+0, &sb->set_uuid0, 4); memcpy(mddev->uuid+4, &sb->set_uuid1, 4); memcpy(mddev->uuid+8, &sb->set_uuid2, 4); memcpy(mddev->uuid+12,&sb->set_uuid3, 4); mddev->max_disks = MD_SB_DISKS; if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && mddev->bitmap_info.file == NULL) { mddev->bitmap_info.offset = mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; } } else if (mddev->pers == NULL) { /* Insist on good event counter while assembling, except * for spares (which don't need an event count) */ ++ev1; if (sb->disks[rdev->desc_nr].state & ( (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) if (ev1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { /* if adding to array with a bitmap, then we can accept an * older device ... but not too old. */ if (ev1 < mddev->bitmap->events_cleared) return 0; if (ev1 < mddev->events) set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ return 0; } desc = sb->disks + rdev->desc_nr; if (desc->state & (1<<MD_DISK_FAULTY)) set_bit(Faulty, &rdev->flags); else if (desc->state & (1<<MD_DISK_SYNC)) { set_bit(In_sync, &rdev->flags); rdev->raid_disk = desc->raid_disk; rdev->saved_raid_disk = desc->raid_disk; } else if (desc->state & (1<<MD_DISK_ACTIVE)) { /* active but not in sync implies recovery up to * reshape position. We don't know exactly where * that is, so set to zero for now */ if (mddev->minor_version >= 91) { rdev->recovery_offset = 0; rdev->raid_disk = desc->raid_disk; } } if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); if (desc->state & (1<<MD_DISK_FAILFAST)) set_bit(FailFast, &rdev->flags); return 0; } /* * sync_super for 0.90.0 */ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) { mdp_super_t *sb; struct md_rdev *rdev2; int next_spare = mddev->raid_disks; /* make rdev->sb match mddev data.. * * 1/ zero out disks * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); * 3/ any empty disks < next_spare become removed * * disks[0] gets initialised to REMOVED because * we cannot be sure from other fields if it has * been initialised or not. */ int i; int active=0, working=0,failed=0,spare=0,nr_disks=0; rdev->sb_size = MD_SB_BYTES; sb = page_address(rdev->sb_page); memset(sb, 0, sizeof(*sb)); sb->md_magic = MD_SB_MAGIC; sb->major_version = mddev->major_version; sb->patch_version = mddev->patch_version; sb->gvalid_words = 0; /* ignored */ memcpy(&sb->set_uuid0, mddev->uuid+0, 4); memcpy(&sb->set_uuid1, mddev->uuid+4, 4); memcpy(&sb->set_uuid2, mddev->uuid+8, 4); memcpy(&sb->set_uuid3, mddev->uuid+12,4); sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); sb->level = mddev->level; sb->size = mddev->dev_sectors / 2; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; sb->not_persistent = 0; sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); sb->state = 0; sb->events_hi = (mddev->events>>32); sb->events_lo = (u32)mddev->events; if (mddev->reshape_position == MaxSector) sb->minor_version = 90; else { sb->minor_version = 91; sb->reshape_position = mddev->reshape_position; sb->new_level = mddev->new_level; sb->delta_disks = mddev->delta_disks; sb->new_layout = mddev->new_layout; sb->new_chunk = mddev->new_chunk_sectors << 9; } mddev->minor_version = sb->minor_version; if (mddev->in_sync) { sb->recovery_cp = mddev->recovery_cp; sb->cp_events_hi = (mddev->events>>32); sb->cp_events_lo = (u32)mddev->events; if (mddev->recovery_cp == MaxSector) sb->state = (1<< MD_SB_CLEAN); } else sb->recovery_cp = 0; sb->layout = mddev->layout; sb->chunk_size = mddev->chunk_sectors << 9; if (mddev->bitmap && mddev->bitmap_info.file == NULL) sb->state |= (1<<MD_SB_BITMAP_PRESENT); sb->disks[0].state = (1<<MD_DISK_REMOVED); rdev_for_each(rdev2, mddev) { mdp_disk_t *d; int desc_nr; int is_active = test_bit(In_sync, &rdev2->flags); if (rdev2->raid_disk >= 0 && sb->minor_version >= 91) /* we have nowhere to store the recovery_offset, * but if it is not below the reshape_position, * we can piggy-back on that. */ is_active = 1; if (rdev2->raid_disk < 0 || test_bit(Faulty, &rdev2->flags)) is_active = 0; if (is_active) desc_nr = rdev2->raid_disk; else desc_nr = next_spare++; rdev2->desc_nr = desc_nr; d = &sb->disks[rdev2->desc_nr]; nr_disks++; d->number = rdev2->desc_nr; d->major = MAJOR(rdev2->bdev->bd_dev); d->minor = MINOR(rdev2->bdev->bd_dev); if (is_active) d->raid_disk = rdev2->raid_disk; else d->raid_disk = rdev2->desc_nr; /* compatibility */ if (test_bit(Faulty, &rdev2->flags)) d->state = (1<<MD_DISK_FAULTY); else if (is_active) { d->state = (1<<MD_DISK_ACTIVE); if (test_bit(In_sync, &rdev2->flags)) d->state |= (1<<MD_DISK_SYNC); active++; working++; } else { d->state = 0; spare++; working++; } if (test_bit(WriteMostly, &rdev2->flags)) d->state |= (1<<MD_DISK_WRITEMOSTLY); if (test_bit(FailFast, &rdev2->flags)) d->state |= (1<<MD_DISK_FAILFAST); } /* now set the "removed" and "faulty" bits on any missing devices */ for (i=0 ; i < mddev->raid_disks ; i++) { mdp_disk_t *d = &sb->disks[i]; if (d->state == 0 && d->number == 0) { d->number = i; d->raid_disk = i; d->state = (1<<MD_DISK_REMOVED); d->state |= (1<<MD_DISK_FAULTY); failed++; } } sb->nr_disks = nr_disks; sb->active_disks = active; sb->working_disks = working; sb->failed_disks = failed; sb->spare_disks = spare; sb->this_disk = sb->disks[rdev->desc_nr]; sb->sb_csum = calc_sb_csum(sb); } /* * rdev_size_change for 0.90.0 */ static unsigned long long super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->mddev->bitmap_info.offset) return 0; /* can't move bitmap */ rdev->sb_start = calc_dev_sboffset(rdev); if (!num_sectors || num_sectors > rdev->sb_start) num_sectors = rdev->sb_start; /* Limit to 4TB as metadata cannot record more than that. * 4TB == 2^32 KB, or 2*2^32 sectors. */ if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) num_sectors = (sector_t)(2ULL << 32) - 2; do { md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } static int super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { /* non-zero offset changes not possible with v0.90 */ return new_offset == 0; } /* * version 1 superblock */ static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) { __le32 disk_csum; u32 csum; unsigned long long newcsum; int size = 256 + le32_to_cpu(sb->max_dev)*2; __le32 *isuper = (__le32*)sb; disk_csum = sb->sb_csum; sb->sb_csum = 0; newcsum = 0; for (; size >= 4; size -= 4) newcsum += le32_to_cpu(*isuper++); if (size == 2) newcsum += le16_to_cpu(*(__le16*) isuper); csum = (newcsum & 0xffffffff) + (newcsum >> 32); sb->sb_csum = disk_csum; return cpu_to_le32(csum); } static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { struct mdp_superblock_1 *sb; int ret; sector_t sb_start; sector_t sectors; int bmask; bool spare_disk = true; /* * Calculate the position of the superblock in 512byte sectors. * It is always aligned to a 4K boundary and * depeding on minor_version, it can be: * 0: At least 8K, but less than 12K, from end of device * 1: At start of device * 2: 4K from start of device. */ switch(minor_version) { case 0: sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; sb_start &= ~(sector_t)(4*2-1); break; case 1: sb_start = 0; break; case 2: sb_start = 8; break; default: return -EINVAL; } rdev->sb_start = sb_start; /* superblock is rarely larger than 1K, but it can be larger, * and it is safe to read 4k, so we do that */ ret = read_disk_sb(rdev, 4096); if (ret) return ret; sb = page_address(rdev->sb_page); if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || sb->major_version != cpu_to_le32(1) || le32_to_cpu(sb->max_dev) > (4096-256)/2 || le64_to_cpu(sb->super_offset) != rdev->sb_start || (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) return -EINVAL; if (calc_sb_1_csum(sb) != sb->sb_csum) { pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); return -EINVAL; } if (le64_to_cpu(sb->data_size) < 10) { pr_warn("md: data_size too small on %pg\n", rdev->bdev); return -EINVAL; } if (sb->pad0 || sb->pad3[0] || memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) /* Some padding is non-zero, might be a new feature */ return -EINVAL; rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); rdev->new_data_offset = rdev->data_offset; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; if (minor_version && rdev->data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; if (minor_version && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; rdev->desc_nr = le32_to_cpu(sb->dev_number); if (!rdev->bb_page) { rdev->bb_page = alloc_page(GFP_KERNEL); if (!rdev->bb_page) return -ENOMEM; } if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && rdev->badblocks.count == 0) { /* need to load the bad block list. * Currently we limit it to one page. */ s32 offset; sector_t bb_sector; __le64 *bbp; int i; int sectors = le16_to_cpu(sb->bblog_size); if (sectors > (PAGE_SIZE / 512)) return -EINVAL; offset = le32_to_cpu(sb->bblog_offset); if (offset == 0) return -EINVAL; bb_sector = (long long)offset; if (!sync_page_io(rdev, bb_sector, sectors << 9, rdev->bb_page, REQ_OP_READ, true)) return -EIO; bbp = (__le64 *)page_address(rdev->bb_page); rdev->badblocks.shift = sb->bblog_shift; for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { u64 bb = le64_to_cpu(*bbp); int count = bb & (0x3ff); u64 sector = bb >> 10; sector <<= sb->bblog_shift; count <<= sb->bblog_shift; if (bb + 1 == 0) break; if (badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) rdev->badblocks.shift = 0; if ((le32_to_cpu(sb->feature_map) & (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); rdev->ppl.size = le16_to_cpu(sb->ppl.size); rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; } if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && sb->level != 0) return -EINVAL; /* not spare disk */ if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) spare_disk = false; if (!refdev) { if (!spare_disk) ret = 1; else ret = 0; } else { __u64 ev1, ev2; struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || sb->level != refsb->level || sb->layout != refsb->layout || sb->chunksize != refsb->chunksize) { pr_warn("md: %pg has strangely different superblock to %pg\n", rdev->bdev, refdev->bdev); return -EINVAL; } ev1 = le64_to_cpu(sb->events); ev2 = le64_to_cpu(refsb->events); if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; } if (minor_version) sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; else sectors = rdev->sb_start; if (sectors < le64_to_cpu(sb->data_size)) return -EINVAL; rdev->sectors = le64_to_cpu(sb->data_size); return ret; } static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) { struct mdp_superblock_1 *sb = page_address(rdev->sb_page); __u64 ev1 = le64_to_cpu(sb->events); int role; rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 1; mddev->patch_version = 0; mddev->external = 0; mddev->chunk_sectors = le32_to_cpu(sb->chunksize); mddev->ctime = le64_to_cpu(sb->ctime); mddev->utime = le64_to_cpu(sb->utime); mddev->level = le32_to_cpu(sb->level); mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->dev_sectors = le64_to_cpu(sb->size); mddev->events = ev1; mddev->bitmap_info.offset = 0; mddev->bitmap_info.space = 0; /* Default location for bitmap is 1K after superblock * using 3K - total of 4K */ mddev->bitmap_info.default_offset = 1024 >> 9; mddev->bitmap_info.default_space = (4096-1024) >> 9; mddev->reshape_backwards = 0; mddev->recovery_cp = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && mddev->bitmap_info.file == NULL) { mddev->bitmap_info.offset = (__s32)le32_to_cpu(sb->bitmap_offset); /* Metadata doesn't record how much space is available. * For 1.0, we assume we can use up to the superblock * if before, else to 4K beyond superblock. * For others, assume no change is possible. */ if (mddev->minor_version > 0) mddev->bitmap_info.space = 0; else if (mddev->bitmap_info.offset > 0) mddev->bitmap_info.space = 8 - mddev->bitmap_info.offset; else mddev->bitmap_info.space = -mddev->bitmap_info.offset; } if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { mddev->reshape_position = le64_to_cpu(sb->reshape_position); mddev->delta_disks = le32_to_cpu(sb->delta_disks); mddev->new_level = le32_to_cpu(sb->new_level); mddev->new_layout = le32_to_cpu(sb->new_layout); mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); if (mddev->delta_disks < 0 || (mddev->delta_disks == 0 && (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_BACKWARDS))) mddev->reshape_backwards = 1; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; } if (mddev->level == 0 && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) mddev->layout = -1; if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); if (le32_to_cpu(sb->feature_map) & (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { if (le32_to_cpu(sb->feature_map) & (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) return -EINVAL; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && (le32_to_cpu(sb->feature_map) & MD_FEATURE_MULTIPLE_PPLS)) return -EINVAL; set_bit(MD_HAS_PPL, &mddev->flags); } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for * spares (which don't need an event count). * Similar to mdadm, we allow event counter difference of 1 * from the freshest device. */ if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) if (ev1 + 1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an * older device, but not too old. */ if (ev1 < mddev->bitmap->events_cleared) return 0; if (ev1 < mddev->events) set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ return 0; } if (rdev->desc_nr < 0 || rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { role = MD_DISK_ROLE_SPARE; rdev->desc_nr = -1; } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { /* * If we are assembling, and our event counter is smaller than the * highest event counter, we cannot trust our superblock about the role. * It could happen that our rdev was marked as Faulty, and all other * superblocks were updated with +1 event counter. * Then, before the next superblock update, which typically happens when * remove_and_add_spares() removes the device from the array, there was * a crash or reboot. * If we allow current rdev without consulting the freshest superblock, * we could cause data corruption. * Note that in this case our event counter is smaller by 1 than the * highest, otherwise, this rdev would not be allowed into array; * both kernel and mdadm allow event counter difference of 1. */ struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); if (rdev->desc_nr >= freshest_max_dev) { /* this is unexpected, better not proceed */ pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", mdname(mddev), rdev->bdev, rdev->desc_nr, freshest->bdev, freshest_max_dev); return -EUCLEAN; } role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", mdname(mddev), rdev->bdev, role, role, freshest->bdev); } else { role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); } switch (role) { case MD_DISK_ROLE_SPARE: /* spare */ break; case MD_DISK_ROLE_FAULTY: /* faulty */ set_bit(Faulty, &rdev->flags); break; case MD_DISK_ROLE_JOURNAL: /* journal device */ if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { /* journal device without journal feature */ pr_warn("md: journal device provided without journal feature, ignoring the device\n"); return -EINVAL; } set_bit(Journal, &rdev->flags); rdev->journal_tail = le64_to_cpu(sb->journal_tail); rdev->raid_disk = 0; break; default: rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) { rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_BITMAP)) rdev->saved_raid_disk = -1; } else { /* * If the array is FROZEN, then the device can't * be in_sync with rest of array. */ if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) set_bit(In_sync, &rdev->flags); } rdev->raid_disk = role; break; } if (sb->devflags & WriteMostly1) set_bit(WriteMostly, &rdev->flags); if (sb->devflags & FailFast1) set_bit(FailFast, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) set_bit(Replacement, &rdev->flags); return 0; } static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) { struct mdp_superblock_1 *sb; struct md_rdev *rdev2; int max_dev, i; /* make rdev->sb match mddev and rdev data. */ sb = page_address(rdev->sb_page); sb->feature_map = 0; sb->pad0 = 0; sb->recovery_offset = cpu_to_le64(0); memset(sb->pad3, 0, sizeof(sb->pad3)); sb->utime = cpu_to_le64((__u64)mddev->utime); sb->events = cpu_to_le64(mddev->events); if (mddev->in_sync) sb->resync_offset = cpu_to_le64(mddev->recovery_cp); else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) sb->resync_offset = cpu_to_le64(MaxSector); else sb->resync_offset = cpu_to_le64(0); sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); sb->raid_disks = cpu_to_le32(mddev->raid_disks); sb->size = cpu_to_le64(mddev->dev_sectors); sb->chunksize = cpu_to_le32(mddev->chunk_sectors); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); if (test_bit(FailFast, &rdev->flags)) sb->devflags |= FailFast1; else sb->devflags &= ~FailFast1; if (test_bit(WriteMostly, &rdev->flags)) sb->devflags |= WriteMostly1; else sb->devflags &= ~WriteMostly1; sb->data_offset = cpu_to_le64(rdev->data_offset); sb->data_size = cpu_to_le64(rdev->sectors); if (mddev->bitmap && mddev->bitmap_info.file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); if (rdev->saved_raid_disk >= 0 && mddev->bitmap) sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); } /* Note: recovery_offset and journal_tail share space */ if (test_bit(Journal, &rdev->flags)) sb->journal_tail = cpu_to_le64(rdev->journal_tail); if (test_bit(Replacement, &rdev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_REPLACEMENT); if (mddev->reshape_position != MaxSector) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); sb->reshape_position = cpu_to_le64(mddev->reshape_position); sb->new_layout = cpu_to_le32(mddev->new_layout); sb->delta_disks = cpu_to_le32(mddev->delta_disks); sb->new_level = cpu_to_le32(mddev->new_level); sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); if (mddev->delta_disks == 0 && mddev->reshape_backwards) sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); if (rdev->new_data_offset != rdev->data_offset) { sb->feature_map |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset - rdev->data_offset)); } } if (mddev_is_clustered(mddev)) sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); if (rdev->badblocks.count == 0) /* Nothing to do for bad blocks*/ ; else if (sb->bblog_offset == 0) /* Cannot record bad blocks on this device */ md_error(mddev, rdev); else { struct badblocks *bb = &rdev->badblocks; __le64 *bbp = (__le64 *)page_address(rdev->bb_page); u64 *p = bb->page; sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); if (bb->changed) { unsigned seq; retry: seq = read_seqbegin(&bb->lock); memset(bbp, 0xff, PAGE_SIZE); for (i = 0 ; i < bb->count ; i++) { u64 internal_bb = p[i]; u64 store_bb = ((BB_OFFSET(internal_bb) << 10) | BB_LEN(internal_bb)); bbp[i] = cpu_to_le64(store_bb); } bb->changed = 0; if (read_seqretry(&bb->lock, seq)) goto retry; bb->sector = (rdev->sb_start + (int)le32_to_cpu(sb->bblog_offset)); bb->size = le16_to_cpu(sb->bblog_size); } } max_dev = 0; rdev_for_each(rdev2, mddev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; if (max_dev > le32_to_cpu(sb->max_dev)) { int bmask; sb->max_dev = cpu_to_le32(max_dev); rdev->sb_size = max_dev * 2 + 256; bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; } else max_dev = le32_to_cpu(sb->max_dev); for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); if (test_bit(MD_HAS_PPL, &mddev->flags)) { if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); else sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); sb->ppl.size = cpu_to_le16(rdev->ppl.size); } rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else if (test_bit(Journal, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); } sb->sb_csum = calc_sb_1_csum(sb); } static sector_t super_1_choose_bm_space(sector_t dev_size) { sector_t bm_space; /* if the device is bigger than 8Gig, save 64k for bitmap * usage, if bigger than 200Gig, save 128k */ if (dev_size < 64*2) bm_space = 0; else if (dev_size - 64*2 >= 200*1024*1024*2) bm_space = 128*2; else if (dev_size - 4*2 > 8*1024*1024*2) bm_space = 64*2; else bm_space = 4*2; return bm_space; } static unsigned long long super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { struct mdp_superblock_1 *sb; sector_t max_sectors; if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->data_offset != rdev->new_data_offset) return 0; /* too confusing */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; } else if (rdev->mddev->bitmap_info.offset) { /* minor version 0 with bitmap we can't move */ return 0; } else { /* minor version 0; superblock after data */ sector_t sb_start, bm_space; sector_t dev_size = bdev_nr_sectors(rdev->bdev); /* 8K is for superblock */ sb_start = dev_size - 8*2; sb_start &= ~(sector_t)(4*2 - 1); bm_space = super_1_choose_bm_space(dev_size); /* Space that can be used to store date needs to decrease * superblock bitmap space and bad block space(4K) */ max_sectors = sb_start - bm_space - 4*2; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; rdev->sb_start = sb_start; } sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); do { md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } static int super_1_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { /* All necessary checks on new >= old have been done */ struct bitmap *bitmap; if (new_offset >= rdev->data_offset) return 1; /* with 1.0 metadata, there is no metadata to tread on * so we can always move back */ if (rdev->mddev->minor_version == 0) return 1; /* otherwise we must be sure not to step on * any metadata, so stay: * 36K beyond start of superblock * beyond end of badblocks * beyond write-intent bitmap */ if (rdev->sb_start + (32+4)*2 > new_offset) return 0; bitmap = rdev->mddev->bitmap; if (bitmap && !rdev->mddev->bitmap_info.file && rdev->sb_start + rdev->mddev->bitmap_info.offset + bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) return 0; if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) return 0; return 1; } static struct super_type super_types[] = { [0] = { .name = "0.90.0", .owner = THIS_MODULE, .load_super = super_90_load, .validate_super = super_90_validate, .sync_super = super_90_sync, .rdev_size_change = super_90_rdev_size_change, .allow_new_offset = super_90_allow_new_offset, }, [1] = { .name = "md-1", .owner = THIS_MODULE, .load_super = super_1_load, .validate_super = super_1_validate, .sync_super = super_1_sync, .rdev_size_change = super_1_rdev_size_change, .allow_new_offset = super_1_allow_new_offset, }, }; static void sync_super(struct mddev *mddev, struct md_rdev *rdev) { if (mddev->sync_super) { mddev->sync_super(mddev, rdev); return; } BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); super_types[mddev->major_version].sync_super(mddev, rdev); } static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) { struct md_rdev *rdev, *rdev2; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev1) { if (test_bit(Faulty, &rdev->flags) || test_bit(Journal, &rdev->flags) || rdev->raid_disk == -1) continue; rdev_for_each_rcu(rdev2, mddev2) { if (test_bit(Faulty, &rdev2->flags) || test_bit(Journal, &rdev2->flags) || rdev2->raid_disk == -1) continue; if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { rcu_read_unlock(); return 1; } } } rcu_read_unlock(); return 0; } static LIST_HEAD(pending_raid_disks); /* * Try to register data integrity profile for an mddev * * This is called when an array is started and after a disk has been kicked * from the array. It only succeeds if all working and active component devices * are integrity capable with matching profiles. */ int md_integrity_register(struct mddev *mddev) { struct md_rdev *rdev, *reference = NULL; if (list_empty(&mddev->disks)) return 0; /* nothing to do */ if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk)) return 0; /* shouldn't register, or already is */ rdev_for_each(rdev, mddev) { /* skip spares and non-functional disks */ if (test_bit(Faulty, &rdev->flags)) continue; if (rdev->raid_disk < 0) continue; if (!reference) { /* Use the first rdev as the reference */ reference = rdev; continue; } /* does this rdev's profile match the reference profile? */ if (blk_integrity_compare(reference->bdev->bd_disk, rdev->bdev->bd_disk) < 0) return -EINVAL; } if (!reference || !bdev_get_integrity(reference->bdev)) return 0; /* * All component devices are integrity capable and have matching * profiles, register the common profile for the md device. */ blk_integrity_register(mddev->gendisk, bdev_get_integrity(reference->bdev)); pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || (mddev->level != 1 && mddev->level != 10 && bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { /* * No need to handle the failure of bioset_integrity_create, * because the function is called by md_run() -> pers->run(), * md_run calls bioset_exit -> bioset_integrity_free in case * of failure case. */ pr_err("md: failed to create integrity pool for %s\n", mdname(mddev)); return -EINVAL; } return 0; } EXPORT_SYMBOL(md_integrity_register); /* * Attempt to add an rdev, but only if it is consistent with the current * integrity profile */ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { struct blk_integrity *bi_mddev; if (mddev_is_dm(mddev)) return 0; bi_mddev = blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ return 0; if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { pr_err("%s: incompatible integrity profile for %pg\n", mdname(mddev), rdev->bdev); return -ENXIO; } return 0; } EXPORT_SYMBOL(md_integrity_add_rdev); static bool rdev_read_only(struct md_rdev *rdev) { return bdev_read_only(rdev->bdev) || (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); } static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) { char b[BDEVNAME_SIZE]; int err; /* prevent duplicates */ if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; if (rdev_read_only(rdev) && mddev->pers) return -EROFS; /* make sure rdev->sectors exceeds mddev->dev_sectors */ if (!test_bit(Journal, &rdev->flags) && rdev->sectors && (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care * about aligning sizes (e.g. linear) */ if (mddev->level > 0) return -ENOSPC; } else mddev->dev_sectors = rdev->sectors; } /* Verify rdev->desc_nr is unique. * If it is -1, assign a free number, else * check number is not in use */ rcu_read_lock(); if (rdev->desc_nr < 0) { int choice = 0; if (mddev->pers) choice = mddev->raid_disks; while (md_find_rdev_nr_rcu(mddev, choice)) choice++; rdev->desc_nr = choice; } else { if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { rcu_read_unlock(); return -EBUSY; } } rcu_read_unlock(); if (!test_bit(Journal, &rdev->flags) && mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { pr_warn("md: %s: array is limited to %d devices\n", mdname(mddev), mddev->max_disks); return -EBUSY; } snprintf(b, sizeof(b), "%pg", rdev->bdev); strreplace(b, '/', '!'); rdev->mddev = mddev; pr_debug("md: bind<%s>\n", b); if (mddev->raid_disks) mddev_create_serial_pool(mddev, rdev); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; /* failure here is OK */ err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); rdev->sysfs_unack_badblocks = sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); rdev->sysfs_badblocks = sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); list_add_rcu(&rdev->same_set, &mddev->disks); bd_link_disk_holder(rdev->bdev, mddev->gendisk); /* May as well allow recovery to be retried once */ mddev->recovery_disabled++; return 0; fail: pr_warn("md: failed to register dev-%s for %s\n", b, mdname(mddev)); mddev_destroy_serial_pool(mddev, rdev); return err; } void md_autodetect_dev(dev_t dev); /* just for claiming the bdev */ static struct md_rdev claim_rdev; static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) { pr_debug("md: export_rdev(%pg)\n", rdev->bdev); md_rdev_clear(rdev); #ifndef MODULE if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); #endif fput(rdev->bdev_file); rdev->bdev = NULL; kobject_put(&rdev->kobj); } static void md_kick_rdev_from_array(struct md_rdev *rdev) { struct mddev *mddev = rdev->mddev; bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); mddev_destroy_serial_pool(rdev->mddev, rdev); WRITE_ONCE(rdev->mddev, NULL); sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); sysfs_put(rdev->sysfs_unack_badblocks); sysfs_put(rdev->sysfs_badblocks); rdev->sysfs_state = NULL; rdev->sysfs_unack_badblocks = NULL; rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; synchronize_rcu(); /* * kobject_del() will wait for all in progress writers to be done, where * reconfig_mutex is held, hence it can't be called under * reconfig_mutex and it's delayed to mddev_unlock(). */ list_add(&rdev->same_set, &mddev->deleting); } static void export_array(struct mddev *mddev) { struct md_rdev *rdev; while (!list_empty(&mddev->disks)) { rdev = list_first_entry(&mddev->disks, struct md_rdev, same_set); md_kick_rdev_from_array(rdev); } mddev->raid_disks = 0; mddev->major_version = 0; } static bool set_in_sync(struct mddev *mddev) { lockdep_assert_held(&mddev->lock); if (!mddev->in_sync) { mddev->sync_checkers++; spin_unlock(&mddev->lock); percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); spin_lock(&mddev->lock); if (!mddev->in_sync && percpu_ref_is_zero(&mddev->writes_pending)) { mddev->in_sync = 1; /* * Ensure ->in_sync is visible before we clear * ->sync_checkers. */ smp_mb(); set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); sysfs_notify_dirent_safe(mddev->sysfs_state); } if (--mddev->sync_checkers == 0) percpu_ref_switch_to_percpu(&mddev->writes_pending); } if (mddev->safemode == 1) mddev->safemode = 0; return mddev->in_sync; } static void sync_sbs(struct mddev *mddev, int nospares) { /* Update each superblock (in-memory image), but * if we are allowed to, skip spares which already * have the right event counter, or have one earlier * (which would mean they aren't being marked as dirty * with the rest of the array) */ struct md_rdev *rdev; rdev_for_each(rdev, mddev) { if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && rdev->sb_events+1 == mddev->events)) { /* Don't update this superblock */ rdev->sb_loaded = 2; } else { sync_super(mddev, rdev); rdev->sb_loaded = 1; } } } static bool does_sb_need_changing(struct mddev *mddev) { struct md_rdev *rdev = NULL, *iter; struct mdp_superblock_1 *sb; int role; /* Find a good rdev */ rdev_for_each(iter, mddev) if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { rdev = iter; break; } /* No good device found. */ if (!rdev) return false; sb = page_address(rdev->sb_page); /* Check if a device has become faulty or a spare become active */ rdev_for_each(rdev, mddev) { role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); /* Device activated? */ if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) return true; /* Device turned faulty? */ if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) return true; } /* Check if any mddev parameters have changed */ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || (mddev->layout != le32_to_cpu(sb->layout)) || (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) return true; return false; } void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; int sync_req; int nospares = 0; int any_badblocks_changed = 0; int ret = -1; if (!md_is_rdwr(mddev)) { if (force_change) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); return; } repeat: if (mddev_is_clustered(mddev)) { if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) force_change = 1; if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) nospares = 1; ret = md_cluster_ops->metadata_update_start(mddev); /* Has someone else has updated the sb */ if (!does_sb_need_changing(mddev)) { if (ret == 0) md_cluster_ops->metadata_update_cancel(mddev); bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)); return; } } /* * First make sure individual recovery_offsets are correct * curr_resync_completed can only be used during recovery. * During reshape/resync it might use array-addresses rather * that device addresses. */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) rdev->recovery_offset = mddev->curr_resync_completed; } if (!mddev->persistent) { clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->external) { clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) { rdev->badblocks.changed = 0; ack_all_badblocks(&rdev->badblocks); md_error(mddev, rdev); } clear_bit(Blocked, &rdev->flags); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } } wake_up(&mddev->sb_wait); return; } spin_lock(&mddev->lock); mddev->utime = ktime_get_real_seconds(); if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) force_change = 1; if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) /* just a clean<-> dirty transition, possibly leave spares alone, * though if events isn't the right even/odd, we will have to do * spares after all */ nospares = 1; if (force_change) nospares = 0; if (mddev->degraded) /* If the array is degraded, then skipping spares is both * dangerous and fairly pointless. * Dangerous because a device that was removed from the array * might have a event_count that still looks up-to-date, * so it can be re-added without a resync. * Pointless because if there are any spares to skip, * then a recovery will happen and soon that array won't * be degraded any more and the spare can go back to sleep then. */ nospares = 0; sync_req = mddev->in_sync; /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ if (nospares && (mddev->in_sync && mddev->recovery_cp == MaxSector) && mddev->can_decrease_events && mddev->events != 1) { mddev->events--; mddev->can_decrease_events = 0; } else { /* otherwise we have to go forward and ... */ mddev->events ++; mddev->can_decrease_events = nospares; } /* * This 64-bit counter should never wrap. * Either we are in around ~1 trillion A.C., assuming * 1 reboot per second, or we have a bug... */ WARN_ON(mddev->events == 0); rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) any_badblocks_changed++; if (test_bit(Faulty, &rdev->flags)) set_bit(FaultRecorded, &rdev->flags); } sync_sbs(mddev, nospares); spin_unlock(&mddev->lock); pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", mdname(mddev), mddev->in_sync); mddev_add_trace_msg(mddev, "md md_update_sb"); rewrite: md_bitmap_update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ if (!test_bit(Faulty, &rdev->flags)) { md_super_write(mddev,rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); pr_debug("md: (write) %pg's sb offset: %llu\n", rdev->bdev, (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; if (rdev->badblocks.size) { md_super_write(mddev, rdev, rdev->badblocks.sector, rdev->badblocks.size << 9, rdev->bb_page); rdev->badblocks.size = 0; } } else pr_debug("md: %pg (skipping faulty)\n", rdev->bdev); } if (md_super_wait(mddev) < 0) goto rewrite; /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->metadata_update_finish(mddev); if (mddev->in_sync != sync_req || !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) /* have to write it out again */ goto repeat; wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify_dirent_safe(mddev->sysfs_completed); rdev_for_each(rdev, mddev) { if (test_and_clear_bit(FaultRecorded, &rdev->flags)) clear_bit(Blocked, &rdev->flags); if (any_badblocks_changed) ack_all_badblocks(&rdev->badblocks); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } } EXPORT_SYMBOL(md_update_sb); static int add_bound_rdev(struct md_rdev *rdev) { struct mddev *mddev = rdev->mddev; int err = 0; bool add_journal = test_bit(Journal, &rdev->flags); if (!mddev->pers->hot_remove_disk || add_journal) { /* If there is hot_add_disk but no hot_remove_disk * then added disks for geometry changes, * and should be added immediately. */ super_types[mddev->major_version]. validate_super(mddev, NULL/*freshest*/, rdev); err = mddev->pers->hot_add_disk(mddev, rdev); if (err) { md_kick_rdev_from_array(rdev); return err; } } sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_new_event(); return 0; } /* words written to sysfs files may, or may not, be \n terminated. * We want to accept with case. For this we use cmd_match. */ static int cmd_match(const char *cmd, const char *str) { /* See if cmd, written into a sysfs file, matches * str. They must either be the same, or cmd can * have a trailing newline */ while (*cmd && *str && *cmd == *str) { cmd++; str++; } if (*cmd == '\n') cmd++; if (*str || *cmd) return 0; return 1; } struct rdev_sysfs_entry { struct attribute attr; ssize_t (*show)(struct md_rdev *, char *); ssize_t (*store)(struct md_rdev *, const char *, size_t); }; static ssize_t state_show(struct md_rdev *rdev, char *page) { char *sep = ","; size_t len = 0; unsigned long flags = READ_ONCE(rdev->flags); if (test_bit(Faulty, &flags) || (!test_bit(ExternalBbl, &flags) && rdev->badblocks.unacked_exist)) len += sprintf(page+len, "faulty%s", sep); if (test_bit(In_sync, &flags)) len += sprintf(page+len, "in_sync%s", sep); if (test_bit(Journal, &flags)) len += sprintf(page+len, "journal%s", sep); if (test_bit(WriteMostly, &flags)) len += sprintf(page+len, "write_mostly%s", sep); if (test_bit(Blocked, &flags) || (rdev->badblocks.unacked_exist && !test_bit(Faulty, &flags))) len += sprintf(page+len, "blocked%s", sep); if (!test_bit(Faulty, &flags) && !test_bit(Journal, &flags) && !test_bit(In_sync, &flags)) len += sprintf(page+len, "spare%s", sep); if (test_bit(WriteErrorSeen, &flags)) len += sprintf(page+len, "write_error%s", sep); if (test_bit(WantReplacement, &flags)) len += sprintf(page+len, "want_replacement%s", sep); if (test_bit(Replacement, &flags)) len += sprintf(page+len, "replacement%s", sep); if (test_bit(ExternalBbl, &flags)) len += sprintf(page+len, "external_bbl%s", sep); if (test_bit(FailFast, &flags)) len += sprintf(page+len, "failfast%s", sep); if (len) len -= strlen(sep); return len+sprintf(page+len, "\n"); } static ssize_t state_store(struct md_rdev *rdev, const char *buf, size_t len) { /* can write * faulty - simulates an error * remove - disconnects the device * writemostly - sets write_mostly * -writemostly - clears write_mostly * blocked - sets the Blocked flags * -blocked - clears the Blocked and possibly simulates an error * insync - sets Insync providing device isn't active * -insync - clear Insync for a device with a slot assigned, * so that it gets rebuilt based on bitmap * write_error - sets WriteErrorSeen * -write_error - clears WriteErrorSeen * {,-}failfast - set/clear FailFast */ struct mddev *mddev = rdev->mddev; int err = -EINVAL; bool need_update_sb = false; if (cmd_match(buf, "faulty") && rdev->mddev->pers) { md_error(rdev->mddev, rdev); if (test_bit(MD_BROKEN, &rdev->mddev->flags)) err = -EBUSY; else err = 0; } else if (cmd_match(buf, "remove")) { if (rdev->mddev->pers) { clear_bit(Blocked, &rdev->flags); remove_and_add_spares(rdev->mddev, rdev); } if (rdev->raid_disk >= 0) err = -EBUSY; else { err = 0; if (mddev_is_clustered(mddev)) err = md_cluster_ops->remove_disk(mddev, rdev); if (err == 0) { md_kick_rdev_from_array(rdev); if (mddev->pers) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_new_event(); } } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); mddev_create_serial_pool(rdev->mddev, rdev); need_update_sb = true; err = 0; } else if (cmd_match(buf, "-writemostly")) { mddev_destroy_serial_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); need_update_sb = true; err = 0; } else if (cmd_match(buf, "blocked")) { set_bit(Blocked, &rdev->flags); err = 0; } else if (cmd_match(buf, "-blocked")) { if (!test_bit(Faulty, &rdev->flags) && !test_bit(ExternalBbl, &rdev->flags) && rdev->badblocks.unacked_exist) { /* metadata handler doesn't understand badblocks, * so we need to fail the device */ md_error(rdev->mddev, rdev); } clear_bit(Blocked, &rdev->flags); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); err = 0; } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { set_bit(In_sync, &rdev->flags); err = 0; } else if (cmd_match(buf, "failfast")) { set_bit(FailFast, &rdev->flags); need_update_sb = true; err = 0; } else if (cmd_match(buf, "-failfast")) { clear_bit(FailFast, &rdev->flags); need_update_sb = true; err = 0; } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags)) { if (rdev->mddev->pers == NULL) { clear_bit(In_sync, &rdev->flags); rdev->saved_raid_disk = rdev->raid_disk; rdev->raid_disk = -1; err = 0; } } else if (cmd_match(buf, "write_error")) { set_bit(WriteErrorSeen, &rdev->flags); err = 0; } else if (cmd_match(buf, "-write_error")) { clear_bit(WriteErrorSeen, &rdev->flags); err = 0; } else if (cmd_match(buf, "want_replacement")) { /* Any non-spare device that is not a replacement can * become want_replacement at any time, but we then need to * check if recovery is needed. */ if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(Replacement, &rdev->flags)) set_bit(WantReplacement, &rdev->flags); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); err = 0; } else if (cmd_match(buf, "-want_replacement")) { /* Clearing 'want_replacement' is always allowed. * Once replacements starts it is too late though. */ err = 0; clear_bit(WantReplacement, &rdev->flags); } else if (cmd_match(buf, "replacement")) { /* Can only set a device as a replacement when array has not * yet been started. Once running, replacement is automatic * from spares, or by assigning 'slot'. */ if (rdev->mddev->pers) err = -EBUSY; else { set_bit(Replacement, &rdev->flags); err = 0; } } else if (cmd_match(buf, "-replacement")) { /* Similarly, can only clear Replacement before start */ if (rdev->mddev->pers) err = -EBUSY; else { clear_bit(Replacement, &rdev->flags); err = 0; } } else if (cmd_match(buf, "re-add")) { if (!rdev->mddev->pers) err = -EINVAL; else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && rdev->saved_raid_disk >= 0) { /* clear_bit is performed _after_ all the devices * have their local Faulty bit cleared. If any writes * happen in the meantime in the local node, they * will land in the local bitmap, which will be synced * by this node eventually */ if (!mddev_is_clustered(rdev->mddev) || (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { clear_bit(Faulty, &rdev->flags); err = add_bound_rdev(rdev); } } else err = -EBUSY; } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { set_bit(ExternalBbl, &rdev->flags); rdev->badblocks.shift = 0; err = 0; } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { clear_bit(ExternalBbl, &rdev->flags); err = 0; } if (need_update_sb) md_update_sb(mddev, 1); if (!err) sysfs_notify_dirent_safe(rdev->sysfs_state); return err ? err : len; } static struct rdev_sysfs_entry rdev_state = __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); static ssize_t errors_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); } static ssize_t errors_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned int n; int rv; rv = kstrtouint(buf, 10, &n); if (rv < 0) return rv; atomic_set(&rdev->corrected_errors, n); return len; } static struct rdev_sysfs_entry rdev_errors = __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); static ssize_t slot_show(struct md_rdev *rdev, char *page) { if (test_bit(Journal, &rdev->flags)) return sprintf(page, "journal\n"); else if (rdev->raid_disk < 0) return sprintf(page, "none\n"); else return sprintf(page, "%d\n", rdev->raid_disk); } static ssize_t slot_store(struct md_rdev *rdev, const char *buf, size_t len) { int slot; int err; if (test_bit(Journal, &rdev->flags)) return -EBUSY; if (strncmp(buf, "none", 4)==0) slot = -1; else { err = kstrtouint(buf, 10, (unsigned int *)&slot); if (err < 0) return err; if (slot < 0) /* overflow */ return -ENOSPC; } if (rdev->mddev->pers && slot == -1) { /* Setting 'slot' on an active array requires also * updating the 'rd%d' link, and communicating * with the personality with ->hot_*_disk. * For now we only support removing * failed/spare devices. This normally happens automatically, * but not when the metadata is externally managed. */ if (rdev->raid_disk == -1) return -EEXIST; /* personality does all needed checks */ if (rdev->mddev->pers->hot_remove_disk == NULL) return -EINVAL; clear_bit(Blocked, &rdev->flags); remove_and_add_spares(rdev->mddev, rdev); if (rdev->raid_disk >= 0) return -EBUSY; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); } else if (rdev->mddev->pers) { /* Activating a spare .. or possibly reactivating * if we ever get bitmaps working here. */ int err; if (rdev->raid_disk != -1) return -EBUSY; if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) return -EBUSY; if (rdev->mddev->pers->hot_add_disk == NULL) return -EINVAL; if (slot >= rdev->mddev->raid_disks && slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; rdev->raid_disk = slot; if (test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = slot; else rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); if (err) { rdev->raid_disk = -1; return err; } else sysfs_notify_dirent_safe(rdev->sysfs_state); /* failure here is OK */; sysfs_link_rdev(rdev->mddev, rdev); /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks && slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; rdev->raid_disk = slot; /* assume it is working */ clear_bit(Faulty, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); set_bit(In_sync, &rdev->flags); sysfs_notify_dirent_safe(rdev->sysfs_state); } return len; } static struct rdev_sysfs_entry rdev_slot = __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); static ssize_t offset_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); } static ssize_t offset_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long offset; if (kstrtoull(buf, 10, &offset) < 0) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; if (rdev->sectors && rdev->mddev->external) /* Must set offset before size, so overlap checks * can be sane */ return -EBUSY; rdev->data_offset = offset; rdev->new_data_offset = offset; return len; } static struct rdev_sysfs_entry rdev_offset = __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); static ssize_t new_offset_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->new_data_offset); } static ssize_t new_offset_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long new_offset; struct mddev *mddev = rdev->mddev; if (kstrtoull(buf, 10, &new_offset) < 0) return -EINVAL; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; if (new_offset == rdev->data_offset) /* reset is always permitted */ ; else if (new_offset > rdev->data_offset) { /* must not push array size beyond rdev_sectors */ if (new_offset - rdev->data_offset + mddev->dev_sectors > rdev->sectors) return -E2BIG; } /* Metadata worries about other space details. */ /* decreasing the offset is inconsistent with a backwards * reshape. */ if (new_offset < rdev->data_offset && mddev->reshape_backwards) return -EINVAL; /* Increasing offset is inconsistent with forwards * reshape. reshape_direction should be set to * 'backwards' first. */ if (new_offset > rdev->data_offset && !mddev->reshape_backwards) return -EINVAL; if (mddev->pers && mddev->persistent && !super_types[mddev->major_version] .allow_new_offset(rdev, new_offset)) return -E2BIG; rdev->new_data_offset = new_offset; if (new_offset > rdev->data_offset) mddev->reshape_backwards = 1; else if (new_offset < rdev->data_offset) mddev->reshape_backwards = 0; return len; } static struct rdev_sysfs_entry rdev_new_offset = __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); static ssize_t rdev_size_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); } static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) { /* check if two start/length pairs overlap */ if (a->data_offset + a->sectors <= b->data_offset) return false; if (b->data_offset + b->sectors <= a->data_offset) return false; return true; } static bool md_rdev_overlaps(struct md_rdev *rdev) { struct mddev *mddev; struct md_rdev *rdev2; spin_lock(&all_mddevs_lock); list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (test_bit(MD_DELETED, &mddev->flags)) continue; rdev_for_each(rdev2, mddev) { if (rdev != rdev2 && rdev->bdev == rdev2->bdev && md_rdevs_overlap(rdev, rdev2)) { spin_unlock(&all_mddevs_lock); return true; } } } spin_unlock(&all_mddevs_lock); return false; } static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) { unsigned long long blocks; sector_t new; if (kstrtoull(buf, 10, &blocks) < 0) return -EINVAL; if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) return -EINVAL; /* sector conversion overflow */ new = blocks * 2; if (new != blocks * 2) return -EINVAL; /* unsigned long long to sector_t overflow */ *sectors = new; return 0; } static ssize_t rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) { struct mddev *my_mddev = rdev->mddev; sector_t oldsectors = rdev->sectors; sector_t sectors; if (test_bit(Journal, &rdev->flags)) return -EBUSY; if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; if (rdev->data_offset != rdev->new_data_offset) return -EINVAL; /* too confusing */ if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->persistent) { sectors = super_types[my_mddev->major_version]. rdev_size_change(rdev, sectors); if (!sectors) return -EBUSY; } else if (!sectors) sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; if (!my_mddev->pers->resize) /* Cannot change size for RAID0 or Linear etc */ return -EINVAL; } if (sectors < my_mddev->dev_sectors) return -EINVAL; /* component must fit device */ rdev->sectors = sectors; /* * Check that all other rdevs with the same bdev do not overlap. This * check does not provide a hard guarantee, it just helps avoid * dangerous mistakes. */ if (sectors > oldsectors && my_mddev->external && md_rdev_overlaps(rdev)) { /* * Someone else could have slipped in a size change here, but * doing so is just silly. We put oldsectors back because we * know it is safe, and trust userspace not to race with itself. */ rdev->sectors = oldsectors; return -EBUSY; } return len; } static struct rdev_sysfs_entry rdev_size = __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) { unsigned long long recovery_start = rdev->recovery_offset; if (test_bit(In_sync, &rdev->flags) || recovery_start == MaxSector) return sprintf(page, "none\n"); return sprintf(page, "%llu\n", recovery_start); } static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long recovery_start; if (cmd_match(buf, "none")) recovery_start = MaxSector; else if (kstrtoull(buf, 10, &recovery_start)) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; rdev->recovery_offset = recovery_start; if (recovery_start == MaxSector) set_bit(In_sync, &rdev->flags); else clear_bit(In_sync, &rdev->flags); return len; } static struct rdev_sysfs_entry rdev_recovery_start = __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); /* sysfs access to bad-blocks list. * We present two files. * 'bad-blocks' lists sector numbers and lengths of ranges that * are recorded as bad. The list is truncated to fit within * the one-page limit of sysfs. * Writing "sector length" to this file adds an acknowledged * bad block list. * 'unacknowledged-bad-blocks' lists bad blocks that have not yet * been acknowledged. Writing to this file adds bad blocks * without acknowledging them. This is largely for testing. */ static ssize_t bb_show(struct md_rdev *rdev, char *page) { return badblocks_show(&rdev->badblocks, page, 0); } static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) { int rv = badblocks_store(&rdev->badblocks, page, len, 0); /* Maybe that ack was all we needed */ if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) wake_up(&rdev->blocked_wait); return rv; } static struct rdev_sysfs_entry rdev_bad_blocks = __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); static ssize_t ubb_show(struct md_rdev *rdev, char *page) { return badblocks_show(&rdev->badblocks, page, 1); } static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) { return badblocks_store(&rdev->badblocks, page, len, 1); } static struct rdev_sysfs_entry rdev_unack_bad_blocks = __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); static ssize_t ppl_sector_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); } static ssize_t ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long sector; if (kstrtoull(buf, 10, §or) < 0) return -EINVAL; if (sector != (sector_t)sector) return -EINVAL; if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && rdev->raid_disk >= 0) return -EBUSY; if (rdev->mddev->persistent) { if (rdev->mddev->major_version == 0) return -EINVAL; if ((sector > rdev->sb_start && sector - rdev->sb_start > S16_MAX) || (sector < rdev->sb_start && rdev->sb_start - sector > -S16_MIN)) return -EINVAL; rdev->ppl.offset = sector - rdev->sb_start; } else if (!rdev->mddev->external) { return -EBUSY; } rdev->ppl.sector = sector; return len; } static struct rdev_sysfs_entry rdev_ppl_sector = __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); static ssize_t ppl_size_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%u\n", rdev->ppl.size); } static ssize_t ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned int size; if (kstrtouint(buf, 10, &size) < 0) return -EINVAL; if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && rdev->raid_disk >= 0) return -EBUSY; if (rdev->mddev->persistent) { if (rdev->mddev->major_version == 0) return -EINVAL; if (size > U16_MAX) return -EINVAL; } else if (!rdev->mddev->external) { return -EBUSY; } rdev->ppl.size = size; return len; } static struct rdev_sysfs_entry rdev_ppl_size = __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_errors.attr, &rdev_slot.attr, &rdev_offset.attr, &rdev_new_offset.attr, &rdev_size.attr, &rdev_recovery_start.attr, &rdev_bad_blocks.attr, &rdev_unack_bad_blocks.attr, &rdev_ppl_sector.attr, &rdev_ppl_size.attr, NULL, }; ATTRIBUTE_GROUPS(rdev_default); static ssize_t rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); if (!entry->show) return -EIO; if (!rdev->mddev) return -ENODEV; return entry->show(rdev, page); } static ssize_t rdev_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); struct kernfs_node *kn = NULL; bool suspend = false; ssize_t rv; struct mddev *mddev = READ_ONCE(rdev->mddev); if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (!mddev) return -ENODEV; if (entry->store == state_store) { if (cmd_match(page, "remove")) kn = sysfs_break_active_protection(kobj, attr); if (cmd_match(page, "remove") || cmd_match(page, "re-add") || cmd_match(page, "writemostly") || cmd_match(page, "-writemostly")) suspend = true; } rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); if (!rv) { if (rdev->mddev == NULL) rv = -ENODEV; else rv = entry->store(rdev, page, length); suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); } if (kn) sysfs_unbreak_active_protection(kn); return rv; } static void rdev_free(struct kobject *ko) { struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); kfree(rdev); } static const struct sysfs_ops rdev_sysfs_ops = { .show = rdev_attr_show, .store = rdev_attr_store, }; static const struct kobj_type rdev_ktype = { .release = rdev_free, .sysfs_ops = &rdev_sysfs_ops, .default_groups = rdev_default_groups, }; int md_rdev_init(struct md_rdev *rdev) { rdev->desc_nr = -1; rdev->saved_raid_disk = -1; rdev->raid_disk = -1; rdev->flags = 0; rdev->data_offset = 0; rdev->new_data_offset = 0; rdev->sb_events = 0; rdev->last_read_error = 0; rdev->sb_loaded = 0; rdev->bb_page = NULL; atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->read_errors, 0); atomic_set(&rdev->corrected_errors, 0); INIT_LIST_HEAD(&rdev->same_set); init_waitqueue_head(&rdev->blocked_wait); /* Add space to store bad block list. * This reserves the space even on arrays where it cannot * be used - I wonder if that matters */ return badblocks_init(&rdev->badblocks, 0); } EXPORT_SYMBOL_GPL(md_rdev_init); /* * Import a device. If 'super_format' >= 0, then sanity check the superblock * * mark the device faulty if: * * - the device is nonexistent (zero size) * - the device has no valid superblock * * a faulty rdev _never_ has rdev->sb set. */ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) { struct md_rdev *rdev; sector_t size; int err; rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); if (!rdev) return ERR_PTR(-ENOMEM); err = md_rdev_init(rdev); if (err) goto out_free_rdev; err = alloc_disk_sb(rdev); if (err) goto out_clear_rdev; rdev->bdev_file = bdev_file_open_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE, super_format == -2 ? &claim_rdev : rdev, NULL); if (IS_ERR(rdev->bdev_file)) { pr_warn("md: could not open device unknown-block(%u,%u).\n", MAJOR(newdev), MINOR(newdev)); err = PTR_ERR(rdev->bdev_file); goto out_clear_rdev; } rdev->bdev = file_bdev(rdev->bdev_file); kobject_init(&rdev->kobj, &rdev_ktype); size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; if (!size) { pr_warn("md: %pg has zero or unknown size, marking faulty!\n", rdev->bdev); err = -EINVAL; goto out_blkdev_put; } if (super_format >= 0) { err = super_types[super_format]. load_super(rdev, NULL, super_minor); if (err == -EINVAL) { pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", rdev->bdev, super_format, super_minor); goto out_blkdev_put; } if (err < 0) { pr_warn("md: could not read %pg's sb, not importing!\n", rdev->bdev); goto out_blkdev_put; } } return rdev; out_blkdev_put: fput(rdev->bdev_file); out_clear_rdev: md_rdev_clear(rdev); out_free_rdev: kfree(rdev); return ERR_PTR(err); } /* * Check a full RAID array for plausibility */ static int analyze_sbs(struct mddev *mddev) { int i; struct md_rdev *rdev, *freshest, *tmp; freshest = NULL; rdev_for_each_safe(rdev, tmp, mddev) switch (super_types[mddev->major_version]. load_super(rdev, freshest, mddev->minor_version)) { case 1: freshest = rdev; break; case 0: break; default: pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", rdev->bdev); md_kick_rdev_from_array(rdev); } /* Cannot find a valid fresh disk */ if (!freshest) { pr_warn("md: cannot find a valid disk\n"); return -EINVAL; } super_types[mddev->major_version]. validate_super(mddev, NULL/*freshest*/, freshest); i = 0; rdev_for_each_safe(rdev, tmp, mddev) { if (mddev->max_disks && (rdev->desc_nr >= mddev->max_disks || i > mddev->max_disks)) { pr_warn("md: %s: %pg: only %d devices permitted\n", mdname(mddev), rdev->bdev, mddev->max_disks); md_kick_rdev_from_array(rdev); continue; } if (rdev != freshest) { if (super_types[mddev->major_version]. validate_super(mddev, freshest, rdev)) { pr_warn("md: kicking non-fresh %pg from array!\n", rdev->bdev); md_kick_rdev_from_array(rdev); continue; } } if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && !test_bit(Journal, &rdev->flags)) { rdev->raid_disk = -1; clear_bit(In_sync, &rdev->flags); } } return 0; } /* Read a fixed-point number. * Numbers in sysfs attributes should be in "standard" units where * possible, so time should be in seconds. * However we internally use a a much smaller unit such as * milliseconds or jiffies. * This function takes a decimal number with a possible fractional * component, and produces an integer which is the result of * multiplying that number by 10^'scale'. * all without any floating-point arithmetic. */ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) { unsigned long result = 0; long decimals = -1; while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { if (*cp == '.') decimals = 0; else if (decimals < scale) { unsigned int value; value = *cp - '0'; result = result * 10 + value; if (decimals >= 0) decimals++; } cp++; } if (*cp == '\n') cp++; if (*cp) return -EINVAL; if (decimals < 0) decimals = 0; *res = result * int_pow(10, scale - decimals); return 0; } static ssize_t safe_delay_show(struct mddev *mddev, char *page) { unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); } static ssize_t safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) { unsigned long msec; if (mddev_is_clustered(mddev)) { pr_warn("md: Safemode is disabled for clustered mode\n"); return -EINVAL; } if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) return -EINVAL; if (msec == 0) mddev->safemode_delay = 0; else { unsigned long old_delay = mddev->safemode_delay; unsigned long new_delay = (msec*HZ)/1000; if (new_delay == 0) new_delay = 1; mddev->safemode_delay = new_delay; if (new_delay < old_delay || old_delay == 0) mod_timer(&mddev->safemode_timer, jiffies+1); } return len; } static struct md_sysfs_entry md_safe_delay = __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); static ssize_t level_show(struct mddev *mddev, char *page) { struct md_personality *p; int ret; spin_lock(&mddev->lock); p = mddev->pers; if (p) ret = sprintf(page, "%s\n", p->name); else if (mddev->clevel[0]) ret = sprintf(page, "%s\n", mddev->clevel); else if (mddev->level != LEVEL_NONE) ret = sprintf(page, "%d\n", mddev->level); else ret = 0; spin_unlock(&mddev->lock); return ret; } static ssize_t level_store(struct mddev *mddev, const char *buf, size_t len) { char clevel[16]; ssize_t rv; size_t slen = len; struct md_personality *pers, *oldpers; long level; void *priv, *oldpriv; struct md_rdev *rdev; if (slen == 0 || slen >= sizeof(clevel)) return -EINVAL; rv = mddev_suspend_and_lock(mddev); if (rv) return rv; if (mddev->pers == NULL) { memcpy(mddev->clevel, buf, slen); if (mddev->clevel[slen-1] == '\n') slen--; mddev->clevel[slen] = 0; mddev->level = LEVEL_NONE; rv = len; goto out_unlock; } rv = -EROFS; if (!md_is_rdwr(mddev)) goto out_unlock; /* request to change the personality. Need to ensure: * - array is not engaged in resync/recovery/reshape * - old personality can be suspended * - new personality will access other array. */ rv = -EBUSY; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || mddev->reshape_position != MaxSector || mddev->sysfs_active) goto out_unlock; rv = -EINVAL; if (!mddev->pers->quiesce) { pr_warn("md: %s: %s does not support online personality change\n", mdname(mddev), mddev->pers->name); goto out_unlock; } /* Now find the new personality */ memcpy(clevel, buf, slen); if (clevel[slen-1] == '\n') slen--; clevel[slen] = 0; if (kstrtol(clevel, 10, &level)) level = LEVEL_NONE; if (request_module("md-%s", clevel) != 0) request_module("md-level-%s", clevel); spin_lock(&pers_lock); pers = find_pers(level, clevel); if (!pers || !try_module_get(pers->owner)) { spin_unlock(&pers_lock); pr_warn("md: personality %s not loaded\n", clevel); rv = -EINVAL; goto out_unlock; } spin_unlock(&pers_lock); if (pers == mddev->pers) { /* Nothing to do! */ module_put(pers->owner); rv = len; goto out_unlock; } if (!pers->takeover) { module_put(pers->owner); pr_warn("md: %s: %s does not support personality takeover\n", mdname(mddev), clevel); rv = -EINVAL; goto out_unlock; } rdev_for_each(rdev, mddev) rdev->new_raid_disk = rdev->raid_disk; /* ->takeover must set new_* and/or delta_disks * if it succeeds, and may set them when it fails. */ priv = pers->takeover(mddev); if (IS_ERR(priv)) { mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; mddev->reshape_backwards = 0; module_put(pers->owner); pr_warn("md: %s: %s would not accept array\n", mdname(mddev), clevel); rv = PTR_ERR(priv); goto out_unlock; } /* Looks like we have a winner */ mddev_detach(mddev); spin_lock(&mddev->lock); oldpers = mddev->pers; oldpriv = mddev->private; mddev->pers = pers; mddev->private = priv; strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; mddev->delta_disks = 0; mddev->reshape_backwards = 0; mddev->degraded = 0; spin_unlock(&mddev->lock); if (oldpers->sync_request == NULL && mddev->external) { /* We are converting from a no-redundancy array * to a redundancy array and metadata is managed * externally so we need to be sure that writes * won't block due to a need to transition * clean->dirty * until external management is started. */ mddev->in_sync = 0; mddev->safemode_delay = 0; mddev->safemode = 0; } oldpers->free(mddev, oldpriv); if (oldpers->sync_request == NULL && pers->sync_request != NULL) { /* need to add the md_redundancy_group */ if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) pr_warn("md: cannot register extra attributes for %s\n", mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); } if (oldpers->sync_request != NULL && pers->sync_request == NULL) { /* need to remove the md_redundancy_group */ if (mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; } module_put(oldpers->owner); rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk >= mddev->raid_disks) rdev->new_raid_disk = -1; if (rdev->new_raid_disk == rdev->raid_disk) continue; sysfs_unlink_rdev(mddev, rdev); } rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk == rdev->raid_disk) continue; rdev->raid_disk = rdev->new_raid_disk; if (rdev->raid_disk < 0) clear_bit(In_sync, &rdev->flags); else { if (sysfs_link_rdev(mddev, rdev)) pr_warn("md: cannot register rd%d for %s after level change\n", rdev->raid_disk, mdname(mddev)); } } if (pers->sync_request == NULL) { /* this is now an array without redundancy, so * it must always be in_sync */ mddev->in_sync = 1; del_timer_sync(&mddev->safemode_timer); } pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->thread) md_update_sb(mddev, 1); sysfs_notify_dirent_safe(mddev->sysfs_level); md_new_event(); rv = len; out_unlock: mddev_unlock_and_resume(mddev); return rv; } static struct md_sysfs_entry md_level = __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); static ssize_t layout_show(struct mddev *mddev, char *page) { /* just a number, not meaningful for all levels */ if (mddev->reshape_position != MaxSector && mddev->layout != mddev->new_layout) return sprintf(page, "%d (%d)\n", mddev->new_layout, mddev->layout); return sprintf(page, "%d\n", mddev->layout); } static ssize_t layout_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int n; int err; err = kstrtouint(buf, 10, &n); if (err < 0) return err; err = mddev_lock(mddev); if (err) return err; if (mddev->pers) { if (mddev->pers->check_reshape == NULL) err = -EBUSY; else if (!md_is_rdwr(mddev)) err = -EROFS; else { mddev->new_layout = n; err = mddev->pers->check_reshape(mddev); if (err) mddev->new_layout = mddev->layout; } } else { mddev->new_layout = n; if (mddev->reshape_position == MaxSector) mddev->layout = n; } mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_layout = __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); static ssize_t raid_disks_show(struct mddev *mddev, char *page) { if (mddev->raid_disks == 0) return 0; if (mddev->reshape_position != MaxSector && mddev->delta_disks != 0) return sprintf(page, "%d (%d)\n", mddev->raid_disks, mddev->raid_disks - mddev->delta_disks); return sprintf(page, "%d\n", mddev->raid_disks); } static int update_raid_disks(struct mddev *mddev, int raid_disks); static ssize_t raid_disks_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int n; int err; err = kstrtouint(buf, 10, &n); if (err < 0) return err; err = mddev_lock(mddev); if (err) return err; if (mddev->pers) err = update_raid_disks(mddev, n); else if (mddev->reshape_position != MaxSector) { struct md_rdev *rdev; int olddisks = mddev->raid_disks - mddev->delta_disks; err = -EINVAL; rdev_for_each(rdev, mddev) { if (olddisks < n && rdev->data_offset < rdev->new_data_offset) goto out_unlock; if (olddisks > n && rdev->data_offset > rdev->new_data_offset) goto out_unlock; } err = 0; mddev->delta_disks = n - olddisks; mddev->raid_disks = n; mddev->reshape_backwards = (mddev->delta_disks < 0); } else mddev->raid_disks = n; out_unlock: mddev_unlock(mddev); return err ? err : len; } static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); static ssize_t uuid_show(struct mddev *mddev, char *page) { return sprintf(page, "%pU\n", mddev->uuid); } static struct md_sysfs_entry md_uuid = __ATTR(uuid, S_IRUGO, uuid_show, NULL); static ssize_t chunk_size_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector && mddev->chunk_sectors != mddev->new_chunk_sectors) return sprintf(page, "%d (%d)\n", mddev->new_chunk_sectors << 9, mddev->chunk_sectors << 9); return sprintf(page, "%d\n", mddev->chunk_sectors << 9); } static ssize_t chunk_size_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long n; int err; err = kstrtoul(buf, 10, &n); if (err < 0) return err; err = mddev_lock(mddev); if (err) return err; if (mddev->pers) { if (mddev->pers->check_reshape == NULL) err = -EBUSY; else if (!md_is_rdwr(mddev)) err = -EROFS; else { mddev->new_chunk_sectors = n >> 9; err = mddev->pers->check_reshape(mddev); if (err) mddev->new_chunk_sectors = mddev->chunk_sectors; } } else { mddev->new_chunk_sectors = n >> 9; if (mddev->reshape_position == MaxSector) mddev->chunk_sectors = n >> 9; } mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_chunk_size = __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t resync_start_show(struct mddev *mddev, char *page) { if (mddev->recovery_cp == MaxSector) return sprintf(page, "none\n"); return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); } static ssize_t resync_start_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long n; int err; if (cmd_match(buf, "none")) n = MaxSector; else { err = kstrtoull(buf, 10, &n); if (err < 0) return err; if (n != (sector_t)n) return -EINVAL; } err = mddev_lock(mddev); if (err) return err; if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) err = -EBUSY; if (!err) { mddev->recovery_cp = n; if (mddev->pers) set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); } mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_resync_start = __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); /* * The array state can be: * * clear * No devices, no size, no level * Equivalent to STOP_ARRAY ioctl * inactive * May have some settings, but array is not active * all IO results in error * When written, doesn't tear down array, but just stops it * suspended (not supported yet) * All IO requests will block. The array can be reconfigured. * Writing this, if accepted, will block until array is quiescent * readonly * no resync can happen. no superblocks get written. * write requests fail * read-auto * like readonly, but behaves like 'clean' on a write request. * * clean - no pending writes, but otherwise active. * When written to inactive array, starts without resync * If a write request arrives then * if metadata is known, mark 'dirty' and switch to 'active'. * if not known, block and switch to write-pending * If written to an active array that has pending writes, then fails. * active * fully active: IO and resync can be happening. * When written to inactive array, starts with resync * * write-pending * clean, but writes are blocked waiting for 'active' to be written. * * active-idle * like active, but no writes have been seen for a while (100msec). * * broken * Array is failed. It's useful because mounted-arrays aren't stopped * when array is failed, so this state will at least alert the user that * something is wrong. */ enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, write_pending, active_idle, broken, bad_word}; static char *array_states[] = { "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", "write-pending", "active-idle", "broken", NULL }; static int match_word(const char *word, char **list) { int n; for (n=0; list[n]; n++) if (cmd_match(word, list[n])) break; return n; } static ssize_t array_state_show(struct mddev *mddev, char *page) { enum array_state st = inactive; if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { switch(mddev->ro) { case MD_RDONLY: st = readonly; break; case MD_AUTO_READ: st = read_auto; break; case MD_RDWR: spin_lock(&mddev->lock); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) st = write_pending; else if (mddev->in_sync) st = clean; else if (mddev->safemode) st = active_idle; else st = active; spin_unlock(&mddev->lock); } if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) st = broken; } else { if (list_empty(&mddev->disks) && mddev->raid_disks == 0 && mddev->dev_sectors == 0) st = clear; else st = inactive; } return sprintf(page, "%s\n", array_states[st]); } static int do_md_stop(struct mddev *mddev, int ro); static int md_set_readonly(struct mddev *mddev); static int restart_array(struct mddev *mddev); static ssize_t array_state_store(struct mddev *mddev, const char *buf, size_t len) { int err = 0; enum array_state st = match_word(buf, array_states); /* No lock dependent actions */ switch (st) { case suspended: /* not supported yet */ case write_pending: /* cannot be set */ case active_idle: /* cannot be set */ case broken: /* cannot be set */ case bad_word: return -EINVAL; case clear: case readonly: case inactive: case read_auto: if (!mddev->pers || !md_is_rdwr(mddev)) break; /* write sysfs will not open mddev and opener should be 0 */ err = mddev_set_closing_and_sync_blockdev(mddev, 0); if (err) return err; break; default: break; } if (mddev->pers && (st == active || st == clean) && mddev->ro != MD_RDONLY) { /* don't take reconfig_mutex when toggling between * clean and active */ spin_lock(&mddev->lock); if (st == active) { restart_array(mddev); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); } else /* st == clean */ { restart_array(mddev); if (!set_in_sync(mddev)) err = -EBUSY; } if (!err) sysfs_notify_dirent_safe(mddev->sysfs_state); spin_unlock(&mddev->lock); return err ?: len; } err = mddev_lock(mddev); if (err) return err; switch (st) { case inactive: /* stop an active array, return 0 otherwise */ if (mddev->pers) err = do_md_stop(mddev, 2); break; case clear: err = do_md_stop(mddev, 0); break; case readonly: if (mddev->pers) err = md_set_readonly(mddev); else { mddev->ro = MD_RDONLY; set_disk_ro(mddev->gendisk, 1); err = do_md_run(mddev); } break; case read_auto: if (mddev->pers) { if (md_is_rdwr(mddev)) err = md_set_readonly(mddev); else if (mddev->ro == MD_RDONLY) err = restart_array(mddev); if (err == 0) { mddev->ro = MD_AUTO_READ; set_disk_ro(mddev->gendisk, 0); } } else { mddev->ro = MD_AUTO_READ; err = do_md_run(mddev); } break; case clean: if (mddev->pers) { err = restart_array(mddev); if (err) break; spin_lock(&mddev->lock); if (!set_in_sync(mddev)) err = -EBUSY; spin_unlock(&mddev->lock); } else err = -EINVAL; break; case active: if (mddev->pers) { err = restart_array(mddev); if (err) break; clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); wake_up(&mddev->sb_wait); err = 0; } else { mddev->ro = MD_RDWR; set_disk_ro(mddev->gendisk, 0); err = do_md_run(mddev); } break; default: err = -EINVAL; break; } if (!err) { if (mddev->hold_active == UNTIL_IOCTL) mddev->hold_active = 0; sysfs_notify_dirent_safe(mddev->sysfs_state); } mddev_unlock(mddev); if (st == readonly || st == read_auto || st == inactive || (err && st == clear)) clear_bit(MD_CLOSING, &mddev->flags); return err ?: len; } static struct md_sysfs_entry md_array_state = __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); static ssize_t max_corrected_read_errors_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", atomic_read(&mddev->max_corr_read_errors)); } static ssize_t max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int n; int rv; rv = kstrtouint(buf, 10, &n); if (rv < 0) return rv; if (n > INT_MAX) return -EINVAL; atomic_set(&mddev->max_corr_read_errors, n); return len; } static struct md_sysfs_entry max_corr_read_errors = __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, max_corrected_read_errors_store); static ssize_t null_show(struct mddev *mddev, char *page) { return -EINVAL; } static ssize_t new_dev_store(struct mddev *mddev, const char *buf, size_t len) { /* buf must be %d:%d\n? giving major and minor numbers */ /* The new device is added to the array. * If the array has a persistent superblock, we read the * superblock to initialise info and check validity. * Otherwise, only checking done is that in bind_rdev_to_array, * which mainly checks size. */ char *e; int major = simple_strtoul(buf, &e, 10); int minor; dev_t dev; struct md_rdev *rdev; int err; if (!*buf || *e != ':' || !e[1] || e[1] == '\n') return -EINVAL; minor = simple_strtoul(e+1, &e, 10); if (*e && *e != '\n') return -EINVAL; dev = MKDEV(major, minor); if (major != MAJOR(dev) || minor != MINOR(dev)) return -EOVERFLOW; err = mddev_suspend_and_lock(mddev); if (err) return err; if (mddev->persistent) { rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { struct md_rdev *rdev0 = list_entry(mddev->disks.next, struct md_rdev, same_set); err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) goto out; } } else if (mddev->external) rdev = md_import_device(dev, -2, -1); else rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) { mddev_unlock_and_resume(mddev); return PTR_ERR(rdev); } err = bind_rdev_to_array(rdev, mddev); out: if (err) export_rdev(rdev, mddev); mddev_unlock_and_resume(mddev); if (!err) md_new_event(); return err ? err : len; } static struct md_sysfs_entry md_new_device = __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); static ssize_t bitmap_store(struct mddev *mddev, const char *buf, size_t len) { char *end; unsigned long chunk, end_chunk; int err; err = mddev_lock(mddev); if (err) return err; if (!mddev->bitmap) goto out; /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ while (*buf) { chunk = end_chunk = simple_strtoul(buf, &end, 0); if (buf == end) break; if (*end == '-') { /* range */ buf = end + 1; end_chunk = simple_strtoul(buf, &end, 0); if (buf == end) break; } if (*end && !isspace(*end)) break; md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); buf = skip_spaces(end); } md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ out: mddev_unlock(mddev); return len; } static struct md_sysfs_entry md_bitmap = __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); static ssize_t size_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->dev_sectors / 2); } static int update_size(struct mddev *mddev, sector_t num_sectors); static ssize_t size_store(struct mddev *mddev, const char *buf, size_t len) { /* If array is inactive, we can reduce the component size, but * not increase it (except from 0). * If array is active, we can try an on-line resize */ sector_t sectors; int err = strict_blocks_to_sectors(buf, §ors); if (err < 0) return err; err = mddev_lock(mddev); if (err) return err; if (mddev->pers) { err = update_size(mddev, sectors); if (err == 0) md_update_sb(mddev, 1); } else { if (mddev->dev_sectors == 0 || mddev->dev_sectors > sectors) mddev->dev_sectors = sectors; else err = -ENOSPC; } mddev_unlock(mddev); return err ? err : len; } static struct md_sysfs_entry md_size = __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); /* Metadata version. * This is one of * 'none' for arrays with no metadata (good luck...) * 'external' for arrays with externally managed metadata, * or N.M for internally known formats */ static ssize_t metadata_show(struct mddev *mddev, char *page) { if (mddev->persistent) return sprintf(page, "%d.%d\n", mddev->major_version, mddev->minor_version); else if (mddev->external) return sprintf(page, "external:%s\n", mddev->metadata_type); else return sprintf(page, "none\n"); } static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) { int major, minor; char *e; int err; /* Changing the details of 'external' metadata is * always permitted. Otherwise there must be * no devices attached to the array. */ err = mddev_lock(mddev); if (err) return err; err = -EBUSY; if (mddev->external && strncmp(buf, "external:", 9) == 0) ; else if (!list_empty(&mddev->disks)) goto out_unlock; err = 0; if (cmd_match(buf, "none")) { mddev->persistent = 0; mddev->external = 0; mddev->major_version = 0; mddev->minor_version = 90; goto out_unlock; } if (strncmp(buf, "external:", 9) == 0) { size_t namelen = len-9; if (namelen >= sizeof(mddev->metadata_type)) namelen = sizeof(mddev->metadata_type)-1; memcpy(mddev->metadata_type, buf+9, namelen); mddev->metadata_type[namelen] = 0; if (namelen && mddev->metadata_type[namelen-1] == '\n') mddev->metadata_type[--namelen] = 0; mddev->persistent = 0; mddev->external = 1; mddev->major_version = 0; mddev->minor_version = 90; goto out_unlock; } major = simple_strtoul(buf, &e, 10); err = -EINVAL; if (e==buf || *e != '.') goto out_unlock; buf = e+1; minor = simple_strtoul(buf, &e, 10); if (e==buf || (*e && *e != '\n') ) goto out_unlock; err = -ENOENT; if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) goto out_unlock; mddev->major_version = major; mddev->minor_version = minor; mddev->persistent = 1; mddev->external = 0; err = 0; out_unlock: mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_metadata = __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t action_show(struct mddev *mddev, char *page) { char *type = "idle"; unsigned long recovery = mddev->recovery; if (test_bit(MD_RECOVERY_FROZEN, &recovery)) type = "frozen"; else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) type = "reshape"; else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) type = "resync"; else if (test_bit(MD_RECOVERY_CHECK, &recovery)) type = "check"; else type = "repair"; } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) type = "recover"; else if (mddev->reshape_position != MaxSector) type = "reshape"; } return sprintf(page, "%s\n", type); } /** * stop_sync_thread() - wait for sync_thread to stop if it's running. * @mddev: the array. * @locked: if set, reconfig_mutex will still be held after this function * return; if not set, reconfig_mutex will be released after this * function return. * @check_seq: if set, only wait for curent running sync_thread to stop, noted * that new sync_thread can still start. */ static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) { int sync_seq; if (check_seq) sync_seq = atomic_read(&mddev->sync_seq); if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { if (!locked) mddev_unlock(mddev); return; } mddev_unlock(mddev); set_bit(MD_RECOVERY_INTR, &mddev->recovery); /* * Thread might be blocked waiting for metadata update which will now * never happen */ md_wakeup_thread_directly(mddev->sync_thread); if (work_pending(&mddev->sync_work)) flush_work(&mddev->sync_work); wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || (check_seq && sync_seq != atomic_read(&mddev->sync_seq))); if (locked) mddev_lock_nointr(mddev); } void md_idle_sync_thread(struct mddev *mddev) { lockdep_assert_held(&mddev->reconfig_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev, true, true); } EXPORT_SYMBOL_GPL(md_idle_sync_thread); void md_frozen_sync_thread(struct mddev *mddev) { lockdep_assert_held(&mddev->reconfig_mutex); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev, true, false); } EXPORT_SYMBOL_GPL(md_frozen_sync_thread); void md_unfrozen_sync_thread(struct mddev *mddev) { lockdep_assert_held(&mddev->reconfig_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); } EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); static void idle_sync_thread(struct mddev *mddev) { mutex_lock(&mddev->sync_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev_lock(mddev)) { mutex_unlock(&mddev->sync_mutex); return; } stop_sync_thread(mddev, false, true); mutex_unlock(&mddev->sync_mutex); } static void frozen_sync_thread(struct mddev *mddev) { mutex_lock(&mddev->sync_mutex); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev_lock(mddev)) { mutex_unlock(&mddev->sync_mutex); return; } stop_sync_thread(mddev, false, false); mutex_unlock(&mddev->sync_mutex); } static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; if (cmd_match(page, "idle")) idle_sync_thread(mddev); else if (cmd_match(page, "frozen")) frozen_sync_thread(mddev); else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; else if (cmd_match(page, "resync")) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); else if (cmd_match(page, "recover")) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); } else if (cmd_match(page, "reshape")) { int err; if (mddev->pers->start_reshape == NULL) return -EINVAL; err = mddev_lock(mddev); if (!err) { if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { err = -EBUSY; } else if (mddev->reshape_position == MaxSector || mddev->pers->check_reshape == NULL || mddev->pers->check_reshape(mddev)) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); err = mddev->pers->start_reshape(mddev); } else { /* * If reshape is still in progress, and * md_check_recovery() can continue to reshape, * don't restart reshape because data can be * corrupted for raid456. */ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } mddev_unlock(mddev); } if (err) return err; sysfs_notify_dirent_safe(mddev->sysfs_degraded); } else { if (cmd_match(page, "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); else if (!cmd_match(page, "repair")) return -EINVAL; clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode */ flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; md_wakeup_thread(mddev->sync_thread); } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); return len; } static struct md_sysfs_entry md_scan_mode = __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); static ssize_t last_sync_action_show(struct mddev *mddev, char *page) { return sprintf(page, "%s\n", mddev->last_sync_action); } static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); static ssize_t mismatch_cnt_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long) atomic64_read(&mddev->resync_mismatches)); } static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); static ssize_t sync_min_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_min(mddev), mddev->sync_speed_min ? "local": "system"); } static ssize_t sync_min_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int min; int rv; if (strncmp(buf, "system", 6)==0) { min = 0; } else { rv = kstrtouint(buf, 10, &min); if (rv < 0) return rv; if (min == 0) return -EINVAL; } mddev->sync_speed_min = min; return len; } static struct md_sysfs_entry md_sync_min = __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); static ssize_t sync_max_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_max(mddev), mddev->sync_speed_max ? "local": "system"); } static ssize_t sync_max_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int max; int rv; if (strncmp(buf, "system", 6)==0) { max = 0; } else { rv = kstrtouint(buf, 10, &max); if (rv < 0) return rv; if (max == 0) return -EINVAL; } mddev->sync_speed_max = max; return len; } static struct md_sysfs_entry md_sync_max = __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); static ssize_t degraded_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->degraded); } static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); static ssize_t sync_force_parallel_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->parallel_resync); } static ssize_t sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) { long n; if (kstrtol(buf, 10, &n)) return -EINVAL; if (n != 0 && n != 1) return -EINVAL; mddev->parallel_resync = n; if (mddev->sync_thread) wake_up(&resync_wait); return len; } /* force parallel resync, even with shared block devices */ static struct md_sysfs_entry md_sync_force_parallel = __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, sync_force_parallel_show, sync_force_parallel_store); static ssize_t sync_speed_show(struct mddev *mddev, char *page) { unsigned long resync, dt, db; if (mddev->curr_resync == MD_RESYNC_NONE) return sprintf(page, "none\n"); resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); dt = (jiffies - mddev->resync_mark) / HZ; if (!dt) dt++; db = resync - mddev->resync_mark_cnt; return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ } static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t sync_completed_show(struct mddev *mddev, char *page) { unsigned long long max_sectors, resync; if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); if (mddev->curr_resync == MD_RESYNC_YIELDED || mddev->curr_resync == MD_RESYNC_DELAYED) return sprintf(page, "delayed\n"); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; else max_sectors = mddev->dev_sectors; resync = mddev->curr_resync_completed; return sprintf(page, "%llu / %llu\n", resync, max_sectors); } static struct md_sysfs_entry md_sync_completed = __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); static ssize_t min_sync_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_min); } static ssize_t min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; int err; if (kstrtoull(buf, 10, &min)) return -EINVAL; spin_lock(&mddev->lock); err = -EINVAL; if (min > mddev->resync_max) goto out_unlock; err = -EBUSY; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) goto out_unlock; /* Round down to multiple of 4K for safety */ mddev->resync_min = round_down(min, 8); err = 0; out_unlock: spin_unlock(&mddev->lock); return err ?: len; } static struct md_sysfs_entry md_min_sync = __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); static ssize_t max_sync_show(struct mddev *mddev, char *page) { if (mddev->resync_max == MaxSector) return sprintf(page, "max\n"); else return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_max); } static ssize_t max_sync_store(struct mddev *mddev, const char *buf, size_t len) { int err; spin_lock(&mddev->lock); if (strncmp(buf, "max", 3) == 0) mddev->resync_max = MaxSector; else { unsigned long long max; int chunk; err = -EINVAL; if (kstrtoull(buf, 10, &max)) goto out_unlock; if (max < mddev->resync_min) goto out_unlock; err = -EBUSY; if (max < mddev->resync_max && md_is_rdwr(mddev) && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) goto out_unlock; /* Must be a multiple of chunk_size */ chunk = mddev->chunk_sectors; if (chunk) { sector_t temp = max; err = -EINVAL; if (sector_div(temp, chunk)) goto out_unlock; } mddev->resync_max = max; } wake_up(&mddev->recovery_wait); err = 0; out_unlock: spin_unlock(&mddev->lock); return err ?: len; } static struct md_sysfs_entry md_max_sync = __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); static ssize_t suspend_lo_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)READ_ONCE(mddev->suspend_lo)); } static ssize_t suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long new; int err; err = kstrtoull(buf, 10, &new); if (err < 0) return err; if (new != (sector_t)new) return -EINVAL; err = mddev_suspend(mddev, true); if (err) return err; WRITE_ONCE(mddev->suspend_lo, new); mddev_resume(mddev); return len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); static ssize_t suspend_hi_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)READ_ONCE(mddev->suspend_hi)); } static ssize_t suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long new; int err; err = kstrtoull(buf, 10, &new); if (err < 0) return err; if (new != (sector_t)new) return -EINVAL; err = mddev_suspend(mddev, true); if (err) return err; WRITE_ONCE(mddev->suspend_hi, new); mddev_resume(mddev); return len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); static ssize_t reshape_position_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector) return sprintf(page, "%llu\n", (unsigned long long)mddev->reshape_position); strcpy(page, "none\n"); return 5; } static ssize_t reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { struct md_rdev *rdev; unsigned long long new; int err; err = kstrtoull(buf, 10, &new); if (err < 0) return err; if (new != (sector_t)new) return -EINVAL; err = mddev_lock(mddev); if (err) return err; err = -EBUSY; if (mddev->pers) goto unlock; mddev->reshape_position = new; mddev->delta_disks = 0; mddev->reshape_backwards = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; rdev_for_each(rdev, mddev) rdev->new_data_offset = rdev->data_offset; err = 0; unlock: mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_reshape_position = __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, reshape_position_store); static ssize_t reshape_direction_show(struct mddev *mddev, char *page) { return sprintf(page, "%s\n", mddev->reshape_backwards ? "backwards" : "forwards"); } static ssize_t reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) { int backwards = 0; int err; if (cmd_match(buf, "forwards")) backwards = 0; else if (cmd_match(buf, "backwards")) backwards = 1; else return -EINVAL; if (mddev->reshape_backwards == backwards) return len; err = mddev_lock(mddev); if (err) return err; /* check if we are allowed to change */ if (mddev->delta_disks) err = -EBUSY; else if (mddev->persistent && mddev->major_version == 0) err = -EINVAL; else mddev->reshape_backwards = backwards; mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_reshape_direction = __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, reshape_direction_store); static ssize_t array_size_show(struct mddev *mddev, char *page) { if (mddev->external_size) return sprintf(page, "%llu\n", (unsigned long long)mddev->array_sectors/2); else return sprintf(page, "default\n"); } static ssize_t array_size_store(struct mddev *mddev, const char *buf, size_t len) { sector_t sectors; int err; err = mddev_lock(mddev); if (err) return err; /* cluster raid doesn't support change array_sectors */ if (mddev_is_clustered(mddev)) { mddev_unlock(mddev); return -EINVAL; } if (strncmp(buf, "default", 7) == 0) { if (mddev->pers) sectors = mddev->pers->size(mddev, 0, 0); else sectors = mddev->array_sectors; mddev->external_size = 0; } else { if (strict_blocks_to_sectors(buf, §ors) < 0) err = -EINVAL; else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) err = -E2BIG; else mddev->external_size = 1; } if (!err) { mddev->array_sectors = sectors; if (mddev->pers) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); } mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_array_size = __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, array_size_store); static ssize_t consistency_policy_show(struct mddev *mddev, char *page) { int ret; if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { ret = sprintf(page, "journal\n"); } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { ret = sprintf(page, "ppl\n"); } else if (mddev->bitmap) { ret = sprintf(page, "bitmap\n"); } else if (mddev->pers) { if (mddev->pers->sync_request) ret = sprintf(page, "resync\n"); else ret = sprintf(page, "none\n"); } else { ret = sprintf(page, "unknown\n"); } return ret; } static ssize_t consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) { int err = 0; if (mddev->pers) { if (mddev->pers->change_consistency_policy) err = mddev->pers->change_consistency_policy(mddev, buf); else err = -EBUSY; } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { set_bit(MD_HAS_PPL, &mddev->flags); } else { err = -EINVAL; } return err ? err : len; } static struct md_sysfs_entry md_consistency_policy = __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, consistency_policy_store); static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->fail_last_dev); } /* * Setting fail_last_dev to true to allow last device to be forcibly removed * from RAID1/RAID10. */ static ssize_t fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) { int ret; bool value; ret = kstrtobool(buf, &value); if (ret) return ret; if (value != mddev->fail_last_dev) mddev->fail_last_dev = value; return len; } static struct md_sysfs_entry md_fail_last_dev = __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, fail_last_dev_store); static ssize_t serialize_policy_show(struct mddev *mddev, char *page) { if (mddev->pers == NULL || (mddev->pers->level != 1)) return sprintf(page, "n/a\n"); else return sprintf(page, "%d\n", mddev->serialize_policy); } /* * Setting serialize_policy to true to enforce write IO is not reordered * for raid1. */ static ssize_t serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) { int err; bool value; err = kstrtobool(buf, &value); if (err) return err; if (value == mddev->serialize_policy) return len; err = mddev_suspend_and_lock(mddev); if (err) return err; if (mddev->pers == NULL || (mddev->pers->level != 1)) { pr_err("md: serialize_policy is only effective for raid1\n"); err = -EINVAL; goto unlock; } if (value) mddev_create_serial_pool(mddev, NULL); else mddev_destroy_serial_pool(mddev, NULL); mddev->serialize_policy = value; unlock: mddev_unlock_and_resume(mddev); return err ?: len; } static struct md_sysfs_entry md_serialize_policy = __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, serialize_policy_store); static struct attribute *md_default_attrs[] = { &md_level.attr, &md_layout.attr, &md_raid_disks.attr, &md_uuid.attr, &md_chunk_size.attr, &md_size.attr, &md_resync_start.attr, &md_metadata.attr, &md_new_device.attr, &md_safe_delay.attr, &md_array_state.attr, &md_reshape_position.attr, &md_reshape_direction.attr, &md_array_size.attr, &max_corr_read_errors.attr, &md_consistency_policy.attr, &md_fail_last_dev.attr, &md_serialize_policy.attr, NULL, }; static const struct attribute_group md_default_group = { .attrs = md_default_attrs, }; static struct attribute *md_redundancy_attrs[] = { &md_scan_mode.attr, &md_last_scan_mode.attr, &md_mismatches.attr, &md_sync_min.attr, &md_sync_max.attr, &md_sync_speed.attr, &md_sync_force_parallel.attr, &md_sync_completed.attr, &md_min_sync.attr, &md_max_sync.attr, &md_suspend_lo.attr, &md_suspend_hi.attr, &md_bitmap.attr, &md_degraded.attr, NULL, }; static const struct attribute_group md_redundancy_group = { .name = NULL, .attrs = md_redundancy_attrs, }; static const struct attribute_group *md_attr_groups[] = { &md_default_group, &md_bitmap_group, NULL, }; static ssize_t md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; if (!entry->show) return -EIO; spin_lock(&all_mddevs_lock); if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); return -EBUSY; } spin_unlock(&all_mddevs_lock); rv = entry->show(mddev, page); mddev_put(mddev); return rv; } static ssize_t md_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; spin_lock(&all_mddevs_lock); if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); return -EBUSY; } spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); mddev_put(mddev); return rv; } static void md_kobj_release(struct kobject *ko) { struct mddev *mddev = container_of(ko, struct mddev, kobj); if (mddev->sysfs_state) sysfs_put(mddev->sysfs_state); if (mddev->sysfs_level) sysfs_put(mddev->sysfs_level); del_gendisk(mddev->gendisk); put_disk(mddev->gendisk); } static const struct sysfs_ops md_sysfs_ops = { .show = md_attr_show, .store = md_attr_store, }; static const struct kobj_type md_ktype = { .release = md_kobj_release, .sysfs_ops = &md_sysfs_ops, .default_groups = md_attr_groups, }; int mdp_major = 0; /* stack the limit for all rdevs into lim */ void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) { queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); } } EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); /* apply the extra stacking limits from a new rdev into mddev */ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) { struct queue_limits lim; if (mddev_is_dm(mddev)) return 0; lim = queue_limits_start_update(mddev->gendisk->queue); queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); return queue_limits_commit_update(mddev->gendisk->queue, &lim); } EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); /* update the optimal I/O size after a reshape */ void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes) { struct queue_limits lim; if (mddev_is_dm(mddev)) return; /* don't bother updating io_opt if we can't suspend the array */ if (mddev_suspend(mddev, false) < 0) return; lim = queue_limits_start_update(mddev->gendisk->queue); lim.io_opt = lim.io_min * nr_stripes; queue_limits_commit_update(mddev->gendisk->queue, &lim); mddev_resume(mddev); } EXPORT_SYMBOL_GPL(mddev_update_io_opt); static void mddev_delayed_delete(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); kobject_put(&mddev->kobj); } struct mddev *md_alloc(dev_t dev, char *name) { /* * If dev is zero, name is the name of a device to allocate with * an arbitrary minor number. It will be "md_???" * If dev is non-zero it must be a device number with a MAJOR of * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then * the device is being created by opening a node in /dev. * If "name" is not NULL, the device is being created by * writing to /sys/module/md_mod/parameters/new_array. */ static DEFINE_MUTEX(disks_mutex); struct mddev *mddev; struct gendisk *disk; int partitioned; int shift; int unit; int error ; /* * Wait for any previous instance of this device to be completely * removed (mddev_delayed_delete). */ flush_workqueue(md_misc_wq); mutex_lock(&disks_mutex); mddev = mddev_alloc(dev); if (IS_ERR(mddev)) { error = PTR_ERR(mddev); goto out_unlock; } partitioned = (MAJOR(mddev->unit) != MD_MAJOR); shift = partitioned ? MdpMinorShift : 0; unit = MINOR(mddev->unit) >> shift; if (name && !dev) { /* Need to ensure that 'name' is not a duplicate. */ struct mddev *mddev2; spin_lock(&all_mddevs_lock); list_for_each_entry(mddev2, &all_mddevs, all_mddevs) if (mddev2->gendisk && strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); error = -EEXIST; goto out_free_mddev; } spin_unlock(&all_mddevs_lock); } if (name && dev) /* * Creating /dev/mdNNN via "newarray", so adjust hold_active. */ mddev->hold_active = UNTIL_STOP; disk = blk_alloc_disk(NULL, NUMA_NO_NODE); if (IS_ERR(disk)) { error = PTR_ERR(disk); goto out_free_mddev; } disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; disk->minors = 1 << shift; if (name) strcpy(disk->disk_name, name); else if (partitioned) sprintf(disk->disk_name, "md_d%d", unit); else sprintf(disk->disk_name, "md%d", unit); disk->fops = &md_fops; disk->private_data = mddev; blk_queue_write_cache(disk->queue, true, true); disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; error = add_disk(disk); if (error) goto out_put_disk; kobject_init(&mddev->kobj, &md_ktype); error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); if (error) { /* * The disk is already live at this point. Clear the hold flag * and let mddev_put take care of the deletion, as it isn't any * different from a normal close on last release now. */ mddev->hold_active = 0; mutex_unlock(&disks_mutex); mddev_put(mddev); return ERR_PTR(error); } kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); mutex_unlock(&disks_mutex); return mddev; out_put_disk: put_disk(disk); out_free_mddev: mddev_free(mddev); out_unlock: mutex_unlock(&disks_mutex); return ERR_PTR(error); } static int md_alloc_and_put(dev_t dev, char *name) { struct mddev *mddev = md_alloc(dev, name); if (IS_ERR(mddev)) return PTR_ERR(mddev); mddev_put(mddev); return 0; } static void md_probe(dev_t dev) { if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) return; if (create_on_open) md_alloc_and_put(dev, NULL); } static int add_named_array(const char *val, const struct kernel_param *kp) { /* * val must be "md_*" or "mdNNN". * For "md_*" we allocate an array with a large free minor number, and * set the name to val. val must not already be an active name. * For "mdNNN" we allocate an array with the minor number NNN * which must not already be in use. */ int len = strlen(val); char buf[DISK_NAME_LEN]; unsigned long devnum; while (len && val[len-1] == '\n') len--; if (len >= DISK_NAME_LEN) return -E2BIG; strscpy(buf, val, len+1); if (strncmp(buf, "md_", 3) == 0) return md_alloc_and_put(0, buf); if (strncmp(buf, "md", 2) == 0 && isdigit(buf[2]) && kstrtoul(buf+2, 10, &devnum) == 0 && devnum <= MINORMASK) return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); return -EINVAL; } static void md_safemode_timeout(struct timer_list *t) { struct mddev *mddev = from_timer(mddev, t, safemode_timer); mddev->safemode = 1; if (mddev->external) sysfs_notify_dirent_safe(mddev->sysfs_state); md_wakeup_thread(mddev->thread); } static int start_dirty_degraded; int md_run(struct mddev *mddev) { int err; struct md_rdev *rdev; struct md_personality *pers; bool nowait = true; if (list_empty(&mddev->disks)) /* cannot run an array with no devices.. */ return -EINVAL; if (mddev->pers) return -EBUSY; /* Cannot run until previous stop completes properly */ if (mddev->sysfs_active) return -EBUSY; /* * Analyze all RAID superblock(s) */ if (!mddev->raid_disks) { if (!mddev->persistent) return -EINVAL; err = analyze_sbs(mddev); if (err) return -EINVAL; } if (mddev->level != LEVEL_NONE) request_module("md-level-%d", mddev->level); else if (mddev->clevel[0]) request_module("md-%s", mddev->clevel); /* * Drop all container device buffers, from now on * the only valid external interface is through the md * device. */ mddev->has_superblocks = false; rdev_for_each(rdev, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); invalidate_bdev(rdev->bdev); if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { mddev->ro = MD_RDONLY; if (!mddev_is_dm(mddev)) set_disk_ro(mddev->gendisk, 1); } if (rdev->sb_page) mddev->has_superblocks = true; /* perform some consistency tests on the device. * We don't want the data to overlap the metadata, * Internal Bitmap issues have been handled elsewhere. */ if (rdev->meta_bdev) { /* Nothing to check */; } else if (rdev->data_offset < rdev->sb_start) { if (mddev->dev_sectors && rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { pr_warn("md: %s: data overlaps metadata\n", mdname(mddev)); return -EINVAL; } } else { if (rdev->sb_start + rdev->sb_size/512 > rdev->data_offset) { pr_warn("md: %s: metadata overlaps data\n", mdname(mddev)); return -EINVAL; } } sysfs_notify_dirent_safe(rdev->sysfs_state); nowait = nowait && bdev_nowait(rdev->bdev); } if (!bioset_initialized(&mddev->bio_set)) { err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) return err; } if (!bioset_initialized(&mddev->sync_set)) { err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) goto exit_bio_set; } if (!bioset_initialized(&mddev->io_clone_set)) { err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, offsetof(struct md_io_clone, bio_clone), 0); if (err) goto exit_sync_set; } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); if (!pers || !try_module_get(pers->owner)) { spin_unlock(&pers_lock); if (mddev->level != LEVEL_NONE) pr_warn("md: personality for level %d is not loaded!\n", mddev->level); else pr_warn("md: personality for level %s is not loaded!\n", mddev->clevel); err = -EINVAL; goto abort; } spin_unlock(&pers_lock); if (mddev->level != pers->level) { mddev->level = pers->level; mddev->new_level = pers->level; } strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ module_put(pers->owner); err = -EINVAL; goto abort; } if (pers->sync_request) { /* Warn if this is a potentially silly * configuration. */ struct md_rdev *rdev2; int warned = 0; rdev_for_each(rdev, mddev) rdev_for_each(rdev2, mddev) { if (rdev < rdev2 && rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", mdname(mddev), rdev->bdev, rdev2->bdev); warned = 1; } } if (warned) pr_warn("True protection against single-disk failure might be compromised.\n"); } /* dm-raid expect sync_thread to be frozen until resume */ if (mddev->gendisk) mddev->recovery = 0; /* may be over-ridden by personality */ mddev->resync_max_sectors = mddev->dev_sectors; mddev->ok_start_degraded = start_dirty_degraded; if (start_readonly && md_is_rdwr(mddev)) mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ err = pers->run(mddev); if (err) pr_warn("md: pers->run() failed ...\n"); else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { WARN_ONCE(!mddev->external_size, "%s: default size too small, but 'external_size' not in effect?\n", __func__); pr_warn("md: invalid array_size %llu > default size %llu\n", (unsigned long long)mddev->array_sectors / 2, (unsigned long long)pers->size(mddev, 0, 0) / 2); err = -EINVAL; } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { struct bitmap *bitmap; bitmap = md_bitmap_create(mddev, -1); if (IS_ERR(bitmap)) { err = PTR_ERR(bitmap); pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); } else mddev->bitmap = bitmap; } if (err) goto bitmap_abort; if (mddev->bitmap_info.max_write_behind > 0) { bool create_pool = false; rdev_for_each(rdev, mddev) { if (test_bit(WriteMostly, &rdev->flags) && rdev_init_serial(rdev)) create_pool = true; } if (create_pool && mddev->serial_info_pool == NULL) { mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, sizeof(struct serial_info)); if (!mddev->serial_info_pool) { err = -ENOMEM; goto bitmap_abort; } } } if (!mddev_is_dm(mddev)) { struct request_queue *q = mddev->gendisk->queue; bool nonrot = true; rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) { nonrot = false; break; } } if (mddev->degraded) nonrot = false; if (nonrot) blk_queue_flag_set(QUEUE_FLAG_NONROT, q); else blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q); /* Set the NOWAIT flags if all underlying devices support it */ if (nowait) blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); } if (pers->sync_request) { if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_redundancy_group)) pr_warn("md: cannot register extra attributes for %s\n", mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); } else if (mddev->ro == MD_AUTO_READ) mddev->ro = MD_RDWR; atomic_set(&mddev->max_corr_read_errors, MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; if (mddev_is_clustered(mddev)) mddev->safemode_delay = 0; else mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; mddev->in_sync = 1; smp_wmb(); spin_lock(&mddev->lock); mddev->pers = pers; spin_unlock(&mddev->lock); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) sysfs_link_rdev(mddev, rdev); /* failure here is OK */ if (mddev->degraded && md_is_rdwr(mddev)) /* This ensures that recovering status is reported immediately * via sysfs - until a lack of spares is confirmed. */ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); if (mddev->sb_flags) md_update_sb(mddev, 0); md_new_event(); return 0; bitmap_abort: mddev_detach(mddev); if (mddev->private) pers->free(mddev, mddev->private); mddev->private = NULL; module_put(pers->owner); md_bitmap_destroy(mddev); abort: bioset_exit(&mddev->io_clone_set); exit_sync_set: bioset_exit(&mddev->sync_set); exit_bio_set: bioset_exit(&mddev->bio_set); return err; } EXPORT_SYMBOL_GPL(md_run); int do_md_run(struct mddev *mddev) { int err; set_bit(MD_NOT_READY, &mddev->flags); err = md_run(mddev); if (err) goto out; err = md_bitmap_load(mddev); if (err) { md_bitmap_destroy(mddev); goto out; } if (mddev_is_clustered(mddev)) md_allow_write(mddev); /* run start up tasks that require md_thread */ md_start(mddev); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); clear_bit(MD_NOT_READY, &mddev->flags); mddev->changed = 1; kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_action); sysfs_notify_dirent_safe(mddev->sysfs_degraded); out: clear_bit(MD_NOT_READY, &mddev->flags); return err; } int md_start(struct mddev *mddev) { int ret = 0; if (mddev->pers->start) { set_bit(MD_RECOVERY_WAIT, &mddev->recovery); ret = mddev->pers->start(mddev); clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); md_wakeup_thread(mddev->sync_thread); } return ret; } EXPORT_SYMBOL_GPL(md_start); static int restart_array(struct mddev *mddev) { struct gendisk *disk = mddev->gendisk; struct md_rdev *rdev; bool has_journal = false; bool has_readonly = false; /* Complain if it has no devices */ if (list_empty(&mddev->disks)) return -ENXIO; if (!mddev->pers) return -EINVAL; if (md_is_rdwr(mddev)) return -EBUSY; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { if (test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) has_journal = true; if (rdev_read_only(rdev)) has_readonly = true; } rcu_read_unlock(); if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) /* Don't restart rw with journal missing/faulty */ return -EINVAL; if (has_readonly) return -EROFS; mddev->safemode = 0; mddev->ro = MD_RDWR; set_disk_ro(disk, 0); pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); /* Kick recovery or resync if necessary */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; } static void md_clean(struct mddev *mddev) { mddev->array_sectors = 0; mddev->external_size = 0; mddev->dev_sectors = 0; mddev->raid_disks = 0; mddev->recovery_cp = 0; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; /* we still need mddev->external in export_rdev, do not clear it yet */ mddev->persistent = 0; mddev->level = LEVEL_NONE; mddev->clevel[0] = 0; /* * Don't clear MD_CLOSING, or mddev can be opened again. * 'hold_active != 0' means mddev is still in the creation * process and will be used later. */ if (mddev->hold_active) mddev->flags = 0; else mddev->flags &= BIT_ULL_MASK(MD_CLOSING); mddev->sb_flags = 0; mddev->ro = MD_RDWR; mddev->metadata_type[0] = 0; mddev->chunk_sectors = 0; mddev->ctime = mddev->utime = 0; mddev->layout = 0; mddev->max_disks = 0; mddev->events = 0; mddev->can_decrease_events = 0; mddev->delta_disks = 0; mddev->reshape_backwards = 0; mddev->new_level = LEVEL_NONE; mddev->new_layout = 0; mddev->new_chunk_sectors = 0; mddev->curr_resync = MD_RESYNC_NONE; atomic64_set(&mddev->resync_mismatches, 0); mddev->suspend_lo = mddev->suspend_hi = 0; mddev->sync_speed_min = mddev->sync_speed_max = 0; mddev->recovery = 0; mddev->in_sync = 0; mddev->changed = 0; mddev->degraded = 0; mddev->safemode = 0; mddev->private = NULL; mddev->cluster_info = NULL; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; mddev->bitmap_info.default_space = 0; mddev->bitmap_info.chunksize = 0; mddev->bitmap_info.daemon_sleep = 0; mddev->bitmap_info.max_write_behind = 0; mddev->bitmap_info.nodes = 0; } static void __md_stop_writes(struct mddev *mddev) { del_timer_sync(&mddev->safemode_timer); if (mddev->pers && mddev->pers->quiesce) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } md_bitmap_flush(mddev); if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || mddev->sb_flags)) { /* mark array as shutdown cleanly */ if (!mddev_is_clustered(mddev)) mddev->in_sync = 1; md_update_sb(mddev, 1); } /* disable policy to guarantee rdevs free resources for serialization */ mddev->serialize_policy = 0; mddev_destroy_serial_pool(mddev, NULL); } void md_stop_writes(struct mddev *mddev) { mddev_lock_nointr(mddev); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev, true, false); __md_stop_writes(mddev); mddev_unlock(mddev); } EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { md_bitmap_wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } md_unregister_thread(mddev, &mddev->thread); /* the unplug fn references 'conf' */ if (!mddev_is_dm(mddev)) blk_sync_queue(mddev->gendisk->queue); } static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; md_bitmap_destroy(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; spin_unlock(&mddev->lock); if (mddev->private) pers->free(mddev, mddev->private); mddev->private = NULL; if (pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); bioset_exit(&mddev->io_clone_set); } void md_stop(struct mddev *mddev) { lockdep_assert_held(&mddev->reconfig_mutex); /* stop the array and free an attached data structures. * This is called from dm-raid */ __md_stop_writes(mddev); __md_stop(mddev); } EXPORT_SYMBOL_GPL(md_stop); /* ensure 'mddev->pers' exist before calling md_set_readonly() */ static int md_set_readonly(struct mddev *mddev) { int err = 0; int did_freeze = 0; if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) return -EBUSY; if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { did_freeze = 1; set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } stop_sync_thread(mddev, false, false); wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); mddev_lock_nointr(mddev); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { pr_warn("md: %s still in use.\n",mdname(mddev)); err = -EBUSY; goto out; } __md_stop_writes(mddev); if (mddev->ro == MD_RDONLY) { err = -ENXIO; goto out; } mddev->ro = MD_RDONLY; set_disk_ro(mddev->gendisk, 1); out: if (!err || did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_state); } return err; } /* mode: * 0 - completely stop and dis-assemble array * 2 - stop but do not disassemble array */ static int do_md_stop(struct mddev *mddev, int mode) { struct gendisk *disk = mddev->gendisk; struct md_rdev *rdev; int did_freeze = 0; if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { did_freeze = 1; set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } stop_sync_thread(mddev, true, false); if (mddev->sysfs_active || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { pr_warn("md: %s still in use.\n",mdname(mddev)); if (did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } return -EBUSY; } if (mddev->pers) { if (!md_is_rdwr(mddev)) set_disk_ro(disk, 0); __md_stop_writes(mddev); __md_stop(mddev); /* tell userspace to handle 'inactive' */ sysfs_notify_dirent_safe(mddev->sysfs_state); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) sysfs_unlink_rdev(mddev, rdev); set_capacity_and_notify(disk, 0); mddev->changed = 1; if (!md_is_rdwr(mddev)) mddev->ro = MD_RDWR; } /* * Free resources if final stop */ if (mode == 0) { pr_info("md: %s stopped.\n", mdname(mddev)); if (mddev->bitmap_info.file) { struct file *f = mddev->bitmap_info.file; spin_lock(&mddev->lock); mddev->bitmap_info.file = NULL; spin_unlock(&mddev->lock); fput(f); } mddev->bitmap_info.offset = 0; export_array(mddev); md_clean(mddev); if (mddev->hold_active == UNTIL_STOP) mddev->hold_active = 0; } md_new_event(); sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; } #ifndef MODULE static void autorun_array(struct mddev *mddev) { struct md_rdev *rdev; int err; if (list_empty(&mddev->disks)) return; pr_info("md: running: "); rdev_for_each(rdev, mddev) { pr_cont("<%pg>", rdev->bdev); } pr_cont("\n"); err = do_md_run(mddev); if (err) { pr_warn("md: do_md_run() returned %d\n", err); do_md_stop(mddev, 0); } } /* * lets try to run arrays based on all disks that have arrived * until now. (those are in pending_raid_disks) * * the method: pick the first pending disk, collect all disks with * the same UUID, remove all from the pending list and put them into * the 'same_array' list. Then order this list based on superblock * update time (freshest comes first), kick out 'old' disks and * compare superblocks. If everything's fine then run it. * * If "unit" is allocated, then bump its reference count */ static void autorun_devices(int part) { struct md_rdev *rdev0, *rdev, *tmp; struct mddev *mddev; pr_info("md: autorun ...\n"); while (!list_empty(&pending_raid_disks)) { int unit; dev_t dev; LIST_HEAD(candidates); rdev0 = list_entry(pending_raid_disks.next, struct md_rdev, same_set); pr_debug("md: considering %pg ...\n", rdev0->bdev); INIT_LIST_HEAD(&candidates); rdev_for_each_list(rdev, tmp, &pending_raid_disks) if (super_90_load(rdev, rdev0, 0) >= 0) { pr_debug("md: adding %pg ...\n", rdev->bdev); list_move(&rdev->same_set, &candidates); } /* * now we have a set of devices, with all of them having * mostly sane superblocks. It's time to allocate the * mddev. */ if (part) { dev = MKDEV(mdp_major, rdev0->preferred_minor << MdpMinorShift); unit = MINOR(dev) >> MdpMinorShift; } else { dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); unit = MINOR(dev); } if (rdev0->preferred_minor != unit) { pr_warn("md: unit number in %pg is bad: %d\n", rdev0->bdev, rdev0->preferred_minor); break; } mddev = md_alloc(dev, NULL); if (IS_ERR(mddev)) break; if (mddev_suspend_and_lock(mddev)) pr_warn("md: %s locked, cannot run\n", mdname(mddev)); else if (mddev->raid_disks || mddev->major_version || !list_empty(&mddev->disks)) { pr_warn("md: %s already running, cannot run %pg\n", mdname(mddev), rdev0->bdev); mddev_unlock_and_resume(mddev); } else { pr_debug("md: created %s\n", mdname(mddev)); mddev->persistent = 1; rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) export_rdev(rdev, mddev); } autorun_array(mddev); mddev_unlock_and_resume(mddev); } /* on success, candidates will be empty, on error * it won't... */ rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); export_rdev(rdev, mddev); } mddev_put(mddev); } pr_info("md: ... autorun DONE.\n"); } #endif /* !MODULE */ static int get_version(void __user *arg) { mdu_version_t ver; ver.major = MD_MAJOR_VERSION; ver.minor = MD_MINOR_VERSION; ver.patchlevel = MD_PATCHLEVEL_VERSION; if (copy_to_user(arg, &ver, sizeof(ver))) return -EFAULT; return 0; } static int get_array_info(struct mddev *mddev, void __user *arg) { mdu_array_info_t info; int nr,working,insync,failed,spare; struct md_rdev *rdev; nr = working = insync = failed = spare = 0; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; else { working++; if (test_bit(In_sync, &rdev->flags)) insync++; else if (test_bit(Journal, &rdev->flags)) /* TODO: add journal count to md_u.h */ ; else spare++; } } rcu_read_unlock(); info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; info.patch_version = MD_PATCHLEVEL_VERSION; info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); info.level = mddev->level; info.size = mddev->dev_sectors / 2; if (info.size != mddev->dev_sectors / 2) /* overflow */ info.size = -1; info.nr_disks = nr; info.raid_disks = mddev->raid_disks; info.md_minor = mddev->md_minor; info.not_persistent= !mddev->persistent; info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); info.state = 0; if (mddev->in_sync) info.state = (1<<MD_SB_CLEAN); if (mddev->bitmap && mddev->bitmap_info.offset) info.state |= (1<<MD_SB_BITMAP_PRESENT); if (mddev_is_clustered(mddev)) info.state |= (1<<MD_SB_CLUSTERED); info.active_disks = insync; info.working_disks = working; info.failed_disks = failed; info.spare_disks = spare; info.layout = mddev->layout; info.chunk_size = mddev->chunk_sectors << 9; if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; return 0; } static int get_bitmap_file(struct mddev *mddev, void __user * arg) { mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ char *ptr; int err; file = kzalloc(sizeof(*file), GFP_NOIO); if (!file) return -ENOMEM; err = 0; spin_lock(&mddev->lock); /* bitmap enabled */ if (mddev->bitmap_info.file) { ptr = file_path(mddev->bitmap_info.file, file->pathname, sizeof(file->pathname)); if (IS_ERR(ptr)) err = PTR_ERR(ptr); else memmove(file->pathname, ptr, sizeof(file->pathname)-(ptr-file->pathname)); } spin_unlock(&mddev->lock); if (err == 0 && copy_to_user(arg, file, sizeof(*file))) err = -EFAULT; kfree(file); return err; } static int get_disk_info(struct mddev *mddev, void __user * arg) { mdu_disk_info_t info; struct md_rdev *rdev; if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; rcu_read_lock(); rdev = md_find_rdev_nr_rcu(mddev, info.number); if (rdev) { info.major = MAJOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev); info.raid_disk = rdev->raid_disk; info.state = 0; if (test_bit(Faulty, &rdev->flags)) info.state |= (1<<MD_DISK_FAULTY); else if (test_bit(In_sync, &rdev->flags)) { info.state |= (1<<MD_DISK_ACTIVE); info.state |= (1<<MD_DISK_SYNC); } if (test_bit(Journal, &rdev->flags)) info.state |= (1<<MD_DISK_JOURNAL); if (test_bit(WriteMostly, &rdev->flags)) info.state |= (1<<MD_DISK_WRITEMOSTLY); if (test_bit(FailFast, &rdev->flags)) info.state |= (1<<MD_DISK_FAILFAST); } else { info.major = info.minor = 0; info.raid_disk = -1; info.state = (1<<MD_DISK_REMOVED); } rcu_read_unlock(); if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; return 0; } int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) { struct md_rdev *rdev; dev_t dev = MKDEV(info->major,info->minor); if (mddev_is_clustered(mddev) && !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { pr_warn("%s: Cannot add to clustered mddev.\n", mdname(mddev)); return -EINVAL; } if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) return -EOVERFLOW; if (!mddev->raid_disks) { int err; /* expecting a device which has a superblock */ rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); if (IS_ERR(rdev)) { pr_warn("md: md_import_device returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); } if (!list_empty(&mddev->disks)) { struct md_rdev *rdev0 = list_entry(mddev->disks.next, struct md_rdev, same_set); err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) { pr_warn("md: %pg has different UUID to %pg\n", rdev->bdev, rdev0->bdev); export_rdev(rdev, mddev); return -EINVAL; } } err = bind_rdev_to_array(rdev, mddev); if (err) export_rdev(rdev, mddev); return err; } /* * md_add_new_disk can be used once the array is assembled * to add "hot spares". They must already have a superblock * written */ if (mddev->pers) { int err; if (!mddev->pers->hot_add_disk) { pr_warn("%s: personality does not support diskops!\n", mdname(mddev)); return -EINVAL; } if (mddev->persistent) rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); else rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) { pr_warn("md: md_import_device returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); } /* set saved_raid_disk if appropriate */ if (!mddev->persistent) { if (info->state & (1<<MD_DISK_SYNC) && info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; clear_bit(Bitmap_sync, &rdev->flags); } else rdev->raid_disk = -1; rdev->saved_raid_disk = rdev->raid_disk; } else super_types[mddev->major_version]. validate_super(mddev, NULL/*freshest*/, rdev); if ((info->state & (1<<MD_DISK_SYNC)) && rdev->raid_disk != info->raid_disk) { /* This was a hot-add request, but events doesn't * match, so reject it. */ export_rdev(rdev, mddev); return -EINVAL; } clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); else clear_bit(WriteMostly, &rdev->flags); if (info->state & (1<<MD_DISK_FAILFAST)) set_bit(FailFast, &rdev->flags); else clear_bit(FailFast, &rdev->flags); if (info->state & (1<<MD_DISK_JOURNAL)) { struct md_rdev *rdev2; bool has_journal = false; /* make sure no existing journal disk */ rdev_for_each(rdev2, mddev) { if (test_bit(Journal, &rdev2->flags)) { has_journal = true; break; } } if (has_journal || mddev->bitmap) { export_rdev(rdev, mddev); return -EBUSY; } set_bit(Journal, &rdev->flags); } /* * check whether the device shows up in other nodes */ if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) set_bit(Candidate, &rdev->flags); else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { /* --add initiated by this node */ err = md_cluster_ops->add_new_disk(mddev, rdev); if (err) { export_rdev(rdev, mddev); return err; } } } rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) export_rdev(rdev, mddev); if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { if (!err) { err = md_cluster_ops->new_disk_ack(mddev, err == 0); if (err) md_kick_rdev_from_array(rdev); } } else { if (err) md_cluster_ops->add_new_disk_cancel(mddev); else err = add_bound_rdev(rdev); } } else if (!err) err = add_bound_rdev(rdev); return err; } /* otherwise, md_add_new_disk is only allowed * for major_version==0 superblocks */ if (mddev->major_version != 0) { pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); return -EINVAL; } if (!(info->state & (1<<MD_DISK_FAULTY))) { int err; rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) { pr_warn("md: error, md_import_device() returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); } rdev->desc_nr = info->number; if (info->raid_disk < mddev->raid_disks) rdev->raid_disk = info->raid_disk; else rdev->raid_disk = -1; if (rdev->raid_disk < mddev->raid_disks) if (info->state & (1<<MD_DISK_SYNC)) set_bit(In_sync, &rdev->flags); if (info->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); if (info->state & (1<<MD_DISK_FAILFAST)) set_bit(FailFast, &rdev->flags); if (!mddev->persistent) { pr_debug("md: nonpersistent superblock ...\n"); rdev->sb_start = bdev_nr_sectors(rdev->bdev); } else rdev->sb_start = calc_dev_sboffset(rdev); rdev->sectors = rdev->sb_start; err = bind_rdev_to_array(rdev, mddev); if (err) { export_rdev(rdev, mddev); return err; } } return 0; } static int hot_remove_disk(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; if (!mddev->pers) return -ENODEV; rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; if (rdev->raid_disk < 0) goto kick_rdev; clear_bit(Blocked, &rdev->flags); remove_and_add_spares(mddev, rdev); if (rdev->raid_disk >= 0) goto busy; kick_rdev: if (mddev_is_clustered(mddev)) { if (md_cluster_ops->remove_disk(mddev, rdev)) goto busy; } md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->thread) md_update_sb(mddev, 1); md_new_event(); return 0; busy: pr_debug("md: cannot remove active disk %pg from %s ...\n", rdev->bdev, mdname(mddev)); return -EBUSY; } static int hot_add_disk(struct mddev *mddev, dev_t dev) { int err; struct md_rdev *rdev; if (!mddev->pers) return -ENODEV; if (mddev->major_version != 0) { pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", mdname(mddev)); return -EINVAL; } if (!mddev->pers->hot_add_disk) { pr_warn("%s: personality does not support diskops!\n", mdname(mddev)); return -EINVAL; } rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) { pr_warn("md: error, md_import_device() returned %ld\n", PTR_ERR(rdev)); return -EINVAL; } if (mddev->persistent) rdev->sb_start = calc_dev_sboffset(rdev); else rdev->sb_start = bdev_nr_sectors(rdev->bdev); rdev->sectors = rdev->sb_start; if (test_bit(Faulty, &rdev->flags)) { pr_warn("md: can not hot-add faulty %pg disk to %s!\n", rdev->bdev, mdname(mddev)); err = -EINVAL; goto abort_export; } clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; rdev->saved_raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) goto abort_export; /* * The rest should better be atomic, we can have disk failures * noticed in interrupt contexts ... */ rdev->raid_disk = -1; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->thread) md_update_sb(mddev, 1); /* * If the new disk does not support REQ_NOWAIT, * disable on the whole MD. */ if (!bdev_nowait(rdev->bdev)) { pr_info("%s: Disabling nowait because %pg does not support nowait\n", mdname(mddev), rdev->bdev); blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue); } /* * Kick recovery, maybe this spare has to be added to the * array immediately. */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_new_event(); return 0; abort_export: export_rdev(rdev, mddev); return err; } static int set_bitmap_file(struct mddev *mddev, int fd) { int err = 0; if (mddev->pers) { if (!mddev->pers->quiesce || !mddev->thread) return -EBUSY; if (mddev->recovery || mddev->sync_thread) return -EBUSY; /* we should be able to change the bitmap.. */ } if (fd >= 0) { struct inode *inode; struct file *f; if (mddev->bitmap || mddev->bitmap_info.file) return -EEXIST; /* cannot add when bitmap is present */ if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { pr_warn("%s: bitmap files not supported by this kernel\n", mdname(mddev)); return -EINVAL; } pr_warn("%s: using deprecated bitmap file support\n", mdname(mddev)); f = fget(fd); if (f == NULL) { pr_warn("%s: error: failed to get bitmap file\n", mdname(mddev)); return -EBADF; } inode = f->f_mapping->host; if (!S_ISREG(inode->i_mode)) { pr_warn("%s: error: bitmap file must be a regular file\n", mdname(mddev)); err = -EBADF; } else if (!(f->f_mode & FMODE_WRITE)) { pr_warn("%s: error: bitmap file must open for write\n", mdname(mddev)); err = -EBADF; } else if (atomic_read(&inode->i_writecount) != 1) { pr_warn("%s: error: bitmap file is already in use\n", mdname(mddev)); err = -EBUSY; } if (err) { fput(f); return err; } mddev->bitmap_info.file = f; mddev->bitmap_info.offset = 0; /* file overrides offset */ } else if (mddev->bitmap == NULL) return -ENOENT; /* cannot remove what isn't there */ err = 0; if (mddev->pers) { if (fd >= 0) { struct bitmap *bitmap; bitmap = md_bitmap_create(mddev, -1); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = md_bitmap_load(mddev); } else err = PTR_ERR(bitmap); if (err) { md_bitmap_destroy(mddev); fd = -1; } } else if (fd < 0) { md_bitmap_destroy(mddev); } } if (fd < 0) { struct file *f = mddev->bitmap_info.file; if (f) { spin_lock(&mddev->lock); mddev->bitmap_info.file = NULL; spin_unlock(&mddev->lock); fput(f); } } return err; } /* * md_set_array_info is used two different ways * The original usage is when creating a new array. * In this usage, raid_disks is > 0 and it together with * level, size, not_persistent,layout,chunksize determine the * shape of the array. * This will always create an array with a type-0.90.0 superblock. * The newer usage is when assembling an array. * In this case raid_disks will be 0, and the major_version field is * use to determine which style super-blocks are to be found on the devices. * The minor and patch _version numbers are also kept incase the * super_block handler wishes to interpret them. */ int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) { if (info->raid_disks == 0) { /* just setting version number for superblock loading */ if (info->major_version < 0 || info->major_version >= ARRAY_SIZE(super_types) || super_types[info->major_version].name == NULL) { /* maybe try to auto-load a module? */ pr_warn("md: superblock version %d not known\n", info->major_version); return -EINVAL; } mddev->major_version = info->major_version; mddev->minor_version = info->minor_version; mddev->patch_version = info->patch_version; mddev->persistent = !info->not_persistent; /* ensure mddev_put doesn't delete this now that there * is some minimal configuration. */ mddev->ctime = ktime_get_real_seconds(); return 0; } mddev->major_version = MD_MAJOR_VERSION; mddev->minor_version = MD_MINOR_VERSION; mddev->patch_version = MD_PATCHLEVEL_VERSION; mddev->ctime = ktime_get_real_seconds(); mddev->level = info->level; mddev->clevel[0] = 0; mddev->dev_sectors = 2 * (sector_t)info->size; mddev->raid_disks = info->raid_disks; /* don't set md_minor, it is determined by which /dev/md* was * openned */ if (info->state & (1<<MD_SB_CLEAN)) mddev->recovery_cp = MaxSector; else mddev->recovery_cp = 0; mddev->persistent = ! info->not_persistent; mddev->external = 0; mddev->layout = info->layout; if (mddev->level == 0) /* Cannot trust RAID0 layout info here */ mddev->layout = -1; mddev->chunk_sectors = info->chunk_size >> 9; if (mddev->persistent) { mddev->max_disks = MD_SB_DISKS; mddev->flags = 0; mddev->sb_flags = 0; } set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); mddev->bitmap_info.offset = 0; mddev->reshape_position = MaxSector; /* * Generate a 128 bit UUID */ get_random_bytes(mddev->uuid, 16); mddev->new_level = mddev->level; mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_layout = mddev->layout; mddev->delta_disks = 0; mddev->reshape_backwards = 0; return 0; } void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) { lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->external_size) return; mddev->array_sectors = array_sectors; } EXPORT_SYMBOL(md_set_array_sectors); static int update_size(struct mddev *mddev, sector_t num_sectors) { struct md_rdev *rdev; int rv; int fit = (num_sectors == 0); sector_t old_dev_sectors = mddev->dev_sectors; if (mddev->pers->resize == NULL) return -EINVAL; /* The "num_sectors" is the number of sectors of each device that * is used. This can only make sense for arrays with redundancy. * linear and raid0 always use whatever space is available. We can only * consider changing this number if no resync or reconstruction is * happening, and if the new size is acceptable. It must fit before the * sb_start or, if that is <data_offset, it must fit before the size * of each device. If num_sectors is zero, we find the largest size * that fits. */ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; if (!md_is_rdwr(mddev)) return -EROFS; rdev_for_each(rdev, mddev) { sector_t avail = rdev->sectors; if (fit && (num_sectors == 0 || num_sectors > avail)) num_sectors = avail; if (avail < num_sectors) return -ENOSPC; } rv = mddev->pers->resize(mddev, num_sectors); if (!rv) { if (mddev_is_clustered(mddev)) md_cluster_ops->update_size(mddev, old_dev_sectors); else if (!mddev_is_dm(mddev)) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); } return rv; } static int update_raid_disks(struct mddev *mddev, int raid_disks) { int rv; struct md_rdev *rdev; /* change the number of raid disks */ if (mddev->pers->check_reshape == NULL) return -EINVAL; if (!md_is_rdwr(mddev)) return -EROFS; if (raid_disks <= 0 || (mddev->max_disks && raid_disks >= mddev->max_disks)) return -EINVAL; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || mddev->reshape_position != MaxSector) return -EBUSY; rdev_for_each(rdev, mddev) { if (mddev->raid_disks < raid_disks && rdev->data_offset < rdev->new_data_offset) return -EINVAL; if (mddev->raid_disks > raid_disks && rdev->data_offset > rdev->new_data_offset) return -EINVAL; } mddev->delta_disks = raid_disks - mddev->raid_disks; if (mddev->delta_disks < 0) mddev->reshape_backwards = 1; else if (mddev->delta_disks > 0) mddev->reshape_backwards = 0; rv = mddev->pers->check_reshape(mddev); if (rv < 0) { mddev->delta_disks = 0; mddev->reshape_backwards = 0; } return rv; } /* * update_array_info is used to change the configuration of an * on-line array. * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size * fields in the info are checked against the array. * Any differences that cannot be handled will cause an error. * Normally, only one change can be managed at a time. */ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) { int rv = 0; int cnt = 0; int state = 0; /* calculate expected state,ignoring low bits */ if (mddev->bitmap && mddev->bitmap_info.offset) state |= (1 << MD_SB_BITMAP_PRESENT); if (mddev->major_version != info->major_version || mddev->minor_version != info->minor_version || /* mddev->patch_version != info->patch_version || */ mddev->ctime != info->ctime || mddev->level != info->level || /* mddev->layout != info->layout || */ mddev->persistent != !info->not_persistent || mddev->chunk_sectors != info->chunk_size >> 9 || /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ ((state^info->state) & 0xfffffe00) ) return -EINVAL; /* Check there is only one change */ if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) cnt++; if (mddev->raid_disks != info->raid_disks) cnt++; if (mddev->layout != info->layout) cnt++; if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; if (cnt == 0) return 0; if (cnt > 1) return -EINVAL; if (mddev->layout != info->layout) { /* Change layout * we don't need to do anything at the md level, the * personality will take care of it all. */ if (mddev->pers->check_reshape == NULL) return -EINVAL; else { mddev->new_layout = info->layout; rv = mddev->pers->check_reshape(mddev); if (rv) mddev->new_layout = mddev->layout; return rv; } } if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); if (mddev->raid_disks != info->raid_disks) rv = update_raid_disks(mddev, info->raid_disks); if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { rv = -EINVAL; goto err; } if (mddev->recovery || mddev->sync_thread) { rv = -EBUSY; goto err; } if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { struct bitmap *bitmap; /* add the bitmap */ if (mddev->bitmap) { rv = -EEXIST; goto err; } if (mddev->bitmap_info.default_offset == 0) { rv = -EINVAL; goto err; } mddev->bitmap_info.offset = mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; bitmap = md_bitmap_create(mddev, -1); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = md_bitmap_load(mddev); } else rv = PTR_ERR(bitmap); if (rv) md_bitmap_destroy(mddev); } else { /* remove the bitmap */ if (!mddev->bitmap) { rv = -ENOENT; goto err; } if (mddev->bitmap->storage.file) { rv = -EINVAL; goto err; } if (mddev->bitmap_info.nodes) { /* hold PW on all the bitmap lock */ if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); rv = -EPERM; md_cluster_ops->unlock_all_bitmaps(mddev); goto err; } mddev->bitmap_info.nodes = 0; md_cluster_ops->leave(mddev); module_put(md_cluster_mod); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } md_bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; } } md_update_sb(mddev, 1); return rv; err: return rv; } static int set_disk_faulty(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; int err = 0; if (mddev->pers == NULL) return -ENODEV; rcu_read_lock(); rdev = md_find_rdev_rcu(mddev, dev); if (!rdev) err = -ENODEV; else { md_error(mddev, rdev); if (test_bit(MD_BROKEN, &mddev->flags)) err = -EBUSY; } rcu_read_unlock(); return err; } /* * We have a problem here : there is no easy way to give a CHS * virtual geometry. We currently pretend that we have a 2 heads * 4 sectors (with a BIG number of cylinders...). This drives * dosfs just mad... ;-) */ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) { struct mddev *mddev = bdev->bd_disk->private_data; geo->heads = 2; geo->sectors = 4; geo->cylinders = mddev->array_sectors / 8; return 0; } static inline int md_ioctl_valid(unsigned int cmd) { switch (cmd) { case GET_ARRAY_INFO: case GET_DISK_INFO: case RAID_VERSION: return 0; case ADD_NEW_DISK: case GET_BITMAP_FILE: case HOT_ADD_DISK: case HOT_REMOVE_DISK: case RESTART_ARRAY_RW: case RUN_ARRAY: case SET_ARRAY_INFO: case SET_BITMAP_FILE: case SET_DISK_FAULTY: case STOP_ARRAY: case STOP_ARRAY_RO: case CLUSTERED_DISK_NACK: if (!capable(CAP_SYS_ADMIN)) return -EACCES; return 0; default: return -ENOTTY; } } static bool md_ioctl_need_suspend(unsigned int cmd) { switch (cmd) { case ADD_NEW_DISK: case HOT_ADD_DISK: case HOT_REMOVE_DISK: case SET_BITMAP_FILE: case SET_ARRAY_INFO: return true; default: return false; } } static int __md_set_array_info(struct mddev *mddev, void __user *argp) { mdu_array_info_t info; int err; if (!argp) memset(&info, 0, sizeof(info)); else if (copy_from_user(&info, argp, sizeof(info))) return -EFAULT; if (mddev->pers) { err = update_array_info(mddev, &info); if (err) pr_warn("md: couldn't update array info. %d\n", err); return err; } if (!list_empty(&mddev->disks)) { pr_warn("md: array %s already has disks!\n", mdname(mddev)); return -EBUSY; } if (mddev->raid_disks) { pr_warn("md: array %s already initialised!\n", mdname(mddev)); return -EBUSY; } err = md_set_array_info(mddev, &info); if (err) pr_warn("md: couldn't set array info. %d\n", err); return err; } static int md_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { int err = 0; void __user *argp = (void __user *)arg; struct mddev *mddev = NULL; err = md_ioctl_valid(cmd); if (err) return err; /* * Commands dealing with the RAID driver but not any * particular array: */ if (cmd == RAID_VERSION) return get_version(argp); /* * Commands creating/starting a new array: */ mddev = bdev->bd_disk->private_data; /* Some actions do not requires the mutex */ switch (cmd) { case GET_ARRAY_INFO: if (!mddev->raid_disks && !mddev->external) return -ENODEV; return get_array_info(mddev, argp); case GET_DISK_INFO: if (!mddev->raid_disks && !mddev->external) return -ENODEV; return get_disk_info(mddev, argp); case SET_DISK_FAULTY: return set_disk_faulty(mddev, new_decode_dev(arg)); case GET_BITMAP_FILE: return get_bitmap_file(mddev, argp); } if (cmd == HOT_REMOVE_DISK) /* need to ensure recovery thread has run */ wait_event_interruptible_timeout(mddev->sb_wait, !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery), msecs_to_jiffies(5000)); if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { /* Need to flush page cache, and ensure no-one else opens * and writes */ err = mddev_set_closing_and_sync_blockdev(mddev, 1); if (err) return err; } if (!md_is_rdwr(mddev)) flush_work(&mddev->sync_work); err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); if (err) { pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", err, cmd); goto out; } if (cmd == SET_ARRAY_INFO) { err = __md_set_array_info(mddev, argp); goto unlock; } /* * Commands querying/configuring an existing array: */ /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ if ((!mddev->raid_disks && !mddev->external) && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE && cmd != GET_BITMAP_FILE) { err = -ENODEV; goto unlock; } /* * Commands even a read-only array can execute: */ switch (cmd) { case RESTART_ARRAY_RW: err = restart_array(mddev); goto unlock; case STOP_ARRAY: err = do_md_stop(mddev, 0); goto unlock; case STOP_ARRAY_RO: if (mddev->pers) err = md_set_readonly(mddev); goto unlock; case HOT_REMOVE_DISK: err = hot_remove_disk(mddev, new_decode_dev(arg)); goto unlock; case ADD_NEW_DISK: /* We can support ADD_NEW_DISK on read-only arrays * only if we are re-adding a preexisting device. * So require mddev->pers and MD_DISK_SYNC. */ if (mddev->pers) { mdu_disk_info_t info; if (copy_from_user(&info, argp, sizeof(info))) err = -EFAULT; else if (!(info.state & (1<<MD_DISK_SYNC))) /* Need to clear read-only for this */ break; else err = md_add_new_disk(mddev, &info); goto unlock; } break; } /* * The remaining ioctls are changing the state of the * superblock, so we do not allow them on read-only arrays. */ if (!md_is_rdwr(mddev) && mddev->pers) { if (mddev->ro != MD_AUTO_READ) { err = -EROFS; goto unlock; } mddev->ro = MD_RDWR; sysfs_notify_dirent_safe(mddev->sysfs_state); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); /* mddev_unlock will wake thread */ /* If a device failed while we were read-only, we * need to make sure the metadata is updated now. */ if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { mddev_unlock(mddev); wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); mddev_lock_nointr(mddev); } } switch (cmd) { case ADD_NEW_DISK: { mdu_disk_info_t info; if (copy_from_user(&info, argp, sizeof(info))) err = -EFAULT; else err = md_add_new_disk(mddev, &info); goto unlock; } case CLUSTERED_DISK_NACK: if (mddev_is_clustered(mddev)) md_cluster_ops->new_disk_ack(mddev, false); else err = -EINVAL; goto unlock; case HOT_ADD_DISK: err = hot_add_disk(mddev, new_decode_dev(arg)); goto unlock; case RUN_ARRAY: err = do_md_run(mddev); goto unlock; case SET_BITMAP_FILE: err = set_bitmap_file(mddev, (int)arg); goto unlock; default: err = -EINVAL; goto unlock; } unlock: if (mddev->hold_active == UNTIL_IOCTL && err != -EINVAL) mddev->hold_active = 0; md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); out: if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY)) clear_bit(MD_CLOSING, &mddev->flags); return err; } #ifdef CONFIG_COMPAT static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { switch (cmd) { case HOT_REMOVE_DISK: case HOT_ADD_DISK: case SET_DISK_FAULTY: case SET_BITMAP_FILE: /* These take in integer arg, do not convert */ break; default: arg = (unsigned long)compat_ptr(arg); break; } return md_ioctl(bdev, mode, cmd, arg); } #endif /* CONFIG_COMPAT */ static int md_set_read_only(struct block_device *bdev, bool ro) { struct mddev *mddev = bdev->bd_disk->private_data; int err; err = mddev_lock(mddev); if (err) return err; if (!mddev->raid_disks && !mddev->external) { err = -ENODEV; goto out_unlock; } /* * Transitioning to read-auto need only happen for arrays that call * md_write_start and which are not ready for writes yet. */ if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { err = restart_array(mddev); if (err) goto out_unlock; mddev->ro = MD_AUTO_READ; } out_unlock: mddev_unlock(mddev); return err; } static int md_open(struct gendisk *disk, blk_mode_t mode) { struct mddev *mddev; int err; spin_lock(&all_mddevs_lock); mddev = mddev_get(disk->private_data); spin_unlock(&all_mddevs_lock); if (!mddev) return -ENODEV; err = mutex_lock_interruptible(&mddev->open_mutex); if (err) goto out; err = -ENODEV; if (test_bit(MD_CLOSING, &mddev->flags)) goto out_unlock; atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); disk_check_media_change(disk); return 0; out_unlock: mutex_unlock(&mddev->open_mutex); out: mddev_put(mddev); return err; } static void md_release(struct gendisk *disk) { struct mddev *mddev = disk->private_data; BUG_ON(!mddev); atomic_dec(&mddev->openers); mddev_put(mddev); } static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) { struct mddev *mddev = disk->private_data; unsigned int ret = 0; if (mddev->changed) ret = DISK_EVENT_MEDIA_CHANGE; mddev->changed = 0; return ret; } static void md_free_disk(struct gendisk *disk) { struct mddev *mddev = disk->private_data; mddev_free(mddev); } const struct block_device_operations md_fops = { .owner = THIS_MODULE, .submit_bio = md_submit_bio, .open = md_open, .release = md_release, .ioctl = md_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = md_compat_ioctl, #endif .getgeo = md_getgeo, .check_events = md_check_events, .set_read_only = md_set_read_only, .free_disk = md_free_disk, }; static int md_thread(void *arg) { struct md_thread *thread = arg; /* * md_thread is a 'system-thread', it's priority should be very * high. We avoid resource deadlocks individually in each * raid personality. (RAID5 does preallocation) We also use RR and * the very same RT priority as kswapd, thus we will never get * into a priority inversion deadlock. * * we definitely have to have equal or higher priority than * bdflush, otherwise bdflush will deadlock if there are too * many dirty RAID5 blocks. */ allow_signal(SIGKILL); while (!kthread_should_stop()) { /* We need to wait INTERRUPTIBLE so that * we don't add to the load-average. * That means we need to be sure no signals are * pending */ if (signal_pending(current)) flush_signals(current); wait_event_interruptible_timeout (thread->wqueue, test_bit(THREAD_WAKEUP, &thread->flags) || kthread_should_stop() || kthread_should_park(), thread->timeout); clear_bit(THREAD_WAKEUP, &thread->flags); if (kthread_should_park()) kthread_parkme(); if (!kthread_should_stop()) thread->run(thread); } return 0; } static void md_wakeup_thread_directly(struct md_thread __rcu *thread) { struct md_thread *t; rcu_read_lock(); t = rcu_dereference(thread); if (t) wake_up_process(t->tsk); rcu_read_unlock(); } void md_wakeup_thread(struct md_thread __rcu *thread) { struct md_thread *t; rcu_read_lock(); t = rcu_dereference(thread); if (t) { pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); set_bit(THREAD_WAKEUP, &t->flags); wake_up(&t->wqueue); } rcu_read_unlock(); } EXPORT_SYMBOL(md_wakeup_thread); struct md_thread *md_register_thread(void (*run) (struct md_thread *), struct mddev *mddev, const char *name) { struct md_thread *thread; thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); if (!thread) return NULL; init_waitqueue_head(&thread->wqueue); thread->run = run; thread->mddev = mddev; thread->timeout = MAX_SCHEDULE_TIMEOUT; thread->tsk = kthread_run(md_thread, thread, "%s_%s", mdname(thread->mddev), name); if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; } return thread; } EXPORT_SYMBOL(md_register_thread); void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) { struct md_thread *thread = rcu_dereference_protected(*threadp, lockdep_is_held(&mddev->reconfig_mutex)); if (!thread) return; rcu_assign_pointer(*threadp, NULL); synchronize_rcu(); pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); kthread_stop(thread->tsk); kfree(thread); } EXPORT_SYMBOL(md_unregister_thread); void md_error(struct mddev *mddev, struct md_rdev *rdev) { if (!rdev || test_bit(Faulty, &rdev->flags)) return; if (!mddev->pers || !mddev->pers->error_handler) return; mddev->pers->error_handler(mddev, rdev); if (mddev->pers->level == 0) return; if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_RECOVERY_INTR, &mddev->recovery); if (!test_bit(MD_BROKEN, &mddev->flags)) { set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); md_new_event(); } EXPORT_SYMBOL(md_error); /* seq_file implementation /proc/mdstat */ static void status_unused(struct seq_file *seq) { int i = 0; struct md_rdev *rdev; seq_printf(seq, "unused devices: "); list_for_each_entry(rdev, &pending_raid_disks, same_set) { i++; seq_printf(seq, "%pg ", rdev->bdev); } if (!i) seq_printf(seq, "<none>"); seq_printf(seq, "\n"); } static void status_personalities(struct seq_file *seq) { struct md_personality *pers; seq_puts(seq, "Personalities : "); spin_lock(&pers_lock); list_for_each_entry(pers, &pers_list, list) seq_printf(seq, "[%s] ", pers->name); spin_unlock(&pers_lock); seq_puts(seq, "\n"); } static int status_resync(struct seq_file *seq, struct mddev *mddev) { sector_t max_sectors, resync, res; unsigned long dt, db = 0; sector_t rt, curr_mark_cnt, resync_mark_cnt; int scale, recovery_active; unsigned int per_milli; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; else max_sectors = mddev->dev_sectors; resync = mddev->curr_resync; if (resync < MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) /* Still cleaning up */ resync = max_sectors; } else if (resync > max_sectors) { resync = max_sectors; } else { res = atomic_read(&mddev->recovery_active); /* * Resync has started, but the subtraction has overflowed or * yielded one of the special values. Force it to active to * ensure the status reports an active resync. */ if (resync < res || resync - res < MD_RESYNC_ACTIVE) resync = MD_RESYNC_ACTIVE; else resync -= res; } if (resync == MD_RESYNC_NONE) { if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && rdev->recovery_offset != MaxSector && rdev->recovery_offset) { seq_printf(seq, "\trecover=REMOTE"); return 1; } if (mddev->reshape_position != MaxSector) seq_printf(seq, "\treshape=REMOTE"); else seq_printf(seq, "\tresync=REMOTE"); return 1; } if (mddev->recovery_cp < MaxSector) { seq_printf(seq, "\tresync=PENDING"); return 1; } return 0; } if (resync < MD_RESYNC_ACTIVE) { seq_printf(seq, "\tresync=DELAYED"); return 1; } WARN_ON(max_sectors == 0); /* Pick 'scale' such that (resync>>scale)*1000 will fit * in a sector_t, and (max_sectors>>scale) will fit in a * u32, as those are the requirements for sector_div. * Thus 'scale' must be at least 10 */ scale = 10; if (sizeof(sector_t) > sizeof(unsigned long)) { while ( max_sectors/2 > (1ULL<<(scale+32))) scale++; } res = (resync>>scale)*1000; sector_div(res, (u32)((max_sectors>>scale)+1)); per_milli = res; { int i, x = per_milli/50, y = 20-x; seq_printf(seq, "["); for (i = 0; i < x; i++) seq_printf(seq, "="); seq_printf(seq, ">"); for (i = 0; i < y; i++) seq_printf(seq, "."); seq_printf(seq, "] "); } seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? "reshape" : (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? "check" : (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"))), per_milli/10, per_milli % 10, (unsigned long long) resync/2, (unsigned long long) max_sectors/2); /* * dt: time from mark until now * db: blocks written from mark until now * rt: remaining time * * rt is a sector_t, which is always 64bit now. We are keeping * the original algorithm, but it is not really necessary. * * Original algorithm: * So we divide before multiply in case it is 32bit and close * to the limit. * We scale the divisor (db) by 32 to avoid losing precision * near the end of resync when the number of remaining sectors * is close to 'db'. * We then divide rt by 32 after multiplying by db to compensate. * The '+1' avoids division by zero if db is very small. */ dt = ((jiffies - mddev->resync_mark) / HZ); if (!dt) dt++; curr_mark_cnt = mddev->curr_mark_cnt; recovery_active = atomic_read(&mddev->recovery_active); resync_mark_cnt = mddev->resync_mark_cnt; if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) db = curr_mark_cnt - (recovery_active + resync_mark_cnt); rt = max_sectors - resync; /* number of remaining sectors */ rt = div64_u64(rt, db/32+1); rt *= dt; rt >>= 5; seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, ((unsigned long)rt % 60)/6); seq_printf(seq, " speed=%ldK/sec", db/2/dt); return 1; } static void *md_seq_start(struct seq_file *seq, loff_t *pos) __acquires(&all_mddevs_lock) { seq->poll_event = atomic_read(&md_event_count); spin_lock(&all_mddevs_lock); return seq_list_start_head(&all_mddevs, *pos); } static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &all_mddevs, pos); } static void md_seq_stop(struct seq_file *seq, void *v) __releases(&all_mddevs_lock) { spin_unlock(&all_mddevs_lock); } static int md_seq_show(struct seq_file *seq, void *v) { struct mddev *mddev; sector_t sectors; struct md_rdev *rdev; if (v == &all_mddevs) { status_personalities(seq); if (list_empty(&all_mddevs)) status_unused(seq); return 0; } mddev = list_entry(v, struct mddev, all_mddevs); if (!mddev_get(mddev)) return 0; spin_unlock(&all_mddevs_lock); spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : %sactive", mdname(mddev), mddev->pers ? "" : "in"); if (mddev->pers) { if (mddev->ro == MD_RDONLY) seq_printf(seq, " (read-only)"); if (mddev->ro == MD_AUTO_READ) seq_printf(seq, " (auto-read-only)"); seq_printf(seq, " %s", mddev->pers->name); } sectors = 0; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); if (test_bit(WriteMostly, &rdev->flags)) seq_printf(seq, "(W)"); if (test_bit(Journal, &rdev->flags)) seq_printf(seq, "(J)"); if (test_bit(Faulty, &rdev->flags)) { seq_printf(seq, "(F)"); continue; } if (rdev->raid_disk < 0) seq_printf(seq, "(S)"); /* spare */ if (test_bit(Replacement, &rdev->flags)) seq_printf(seq, "(R)"); sectors += rdev->sectors; } rcu_read_unlock(); if (!list_empty(&mddev->disks)) { if (mddev->pers) seq_printf(seq, "\n %llu blocks", (unsigned long long) mddev->array_sectors / 2); else seq_printf(seq, "\n %llu blocks", (unsigned long long)sectors / 2); } if (mddev->persistent) { if (mddev->major_version != 0 || mddev->minor_version != 90) { seq_printf(seq," super %d.%d", mddev->major_version, mddev->minor_version); } } else if (mddev->external) seq_printf(seq, " super external:%s", mddev->metadata_type); else seq_printf(seq, " super non-persistent"); if (mddev->pers) { mddev->pers->status(seq, mddev); seq_printf(seq, "\n "); if (mddev->pers->sync_request) { if (status_resync(seq, mddev)) seq_printf(seq, "\n "); } } else seq_printf(seq, "\n "); md_bitmap_status(seq, mddev->bitmap); seq_printf(seq, "\n"); } spin_unlock(&mddev->lock); spin_lock(&all_mddevs_lock); if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) status_unused(seq); if (atomic_dec_and_test(&mddev->active)) __mddev_put(mddev); return 0; } static const struct seq_operations md_seq_ops = { .start = md_seq_start, .next = md_seq_next, .stop = md_seq_stop, .show = md_seq_show, }; static int md_seq_open(struct inode *inode, struct file *file) { struct seq_file *seq; int error; error = seq_open(file, &md_seq_ops); if (error) return error; seq = file->private_data; seq->poll_event = atomic_read(&md_event_count); return error; } static int md_unloading; static __poll_t mdstat_poll(struct file *filp, poll_table *wait) { struct seq_file *seq = filp->private_data; __poll_t mask; if (md_unloading) return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; poll_wait(filp, &md_event_waiters, wait); /* always allow read */ mask = EPOLLIN | EPOLLRDNORM; if (seq->poll_event != atomic_read(&md_event_count)) mask |= EPOLLERR | EPOLLPRI; return mask; } static const struct proc_ops mdstat_proc_ops = { .proc_open = md_seq_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, .proc_poll = mdstat_poll, }; int register_md_personality(struct md_personality *p) { pr_debug("md: %s personality registered for level %d\n", p->name, p->level); spin_lock(&pers_lock); list_add_tail(&p->list, &pers_list); spin_unlock(&pers_lock); return 0; } EXPORT_SYMBOL(register_md_personality); int unregister_md_personality(struct md_personality *p) { pr_debug("md: %s personality unregistered\n", p->name); spin_lock(&pers_lock); list_del_init(&p->list); spin_unlock(&pers_lock); return 0; } EXPORT_SYMBOL(unregister_md_personality); int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module) { int ret = 0; spin_lock(&pers_lock); if (md_cluster_ops != NULL) ret = -EALREADY; else { md_cluster_ops = ops; md_cluster_mod = module; } spin_unlock(&pers_lock); return ret; } EXPORT_SYMBOL(register_md_cluster_operations); int unregister_md_cluster_operations(void) { spin_lock(&pers_lock); md_cluster_ops = NULL; spin_unlock(&pers_lock); return 0; } EXPORT_SYMBOL(unregister_md_cluster_operations); int md_setup_cluster(struct mddev *mddev, int nodes) { int ret; if (!md_cluster_ops) request_module("md-cluster"); spin_lock(&pers_lock); /* ensure module won't be unloaded */ if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { pr_warn("can't find md-cluster module or get its reference.\n"); spin_unlock(&pers_lock); return -ENOENT; } spin_unlock(&pers_lock); ret = md_cluster_ops->join(mddev, nodes); if (!ret) mddev->safemode_delay = 0; return ret; } void md_cluster_stop(struct mddev *mddev) { if (!md_cluster_ops) return; md_cluster_ops->leave(mddev); module_put(md_cluster_mod); } static int is_mddev_idle(struct mddev *mddev, int init) { struct md_rdev *rdev; int idle; int curr_events; idle = 1; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_disk; curr_events = (int)part_stat_read_accum(disk->part0, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and * disk_stats is counted when it completes. * So resync activity will cause curr_events to be smaller than * when there was no such activity. * non-sync IO will cause disk_stat to increase without * increasing sync_io so curr_events will (eventually) * be larger than it was before. Once it becomes * substantially larger, the test below will cause * the array to appear non-idle, and resync will slow * down. * If there is a lot of outstanding resync activity when * we set last_event to curr_events, then all that activity * completing might cause the array to appear non-idle * and resync will be slowed down even though there might * not have been non-resync activity. This will only * happen once though. 'last_events' will soon reflect * the state where there is little or no outstanding * resync requests, and further resync activity will * always make curr_events less than last_events. * */ if (init || curr_events - rdev->last_events > 64) { rdev->last_events = curr_events; idle = 0; } } rcu_read_unlock(); return idle; } void md_done_sync(struct mddev *mddev, int blocks, int ok) { /* another "blocks" (512byte) blocks have been synced */ atomic_sub(blocks, &mddev->recovery_active); wake_up(&mddev->recovery_wait); if (!ok) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_ERROR, &mddev->recovery); md_wakeup_thread(mddev->thread); // stop recovery, signal do_sync .... } } EXPORT_SYMBOL(md_done_sync); /* md_write_start(mddev, bi) * If we need to update some array metadata (e.g. 'active' flag * in superblock) before writing, schedule a superblock update * and wait for it to complete. * A return value of 'false' means that the write wasn't recorded * and cannot proceed as the array is being suspend. */ bool md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; if (bio_data_dir(bi) != WRITE) return true; BUG_ON(mddev->ro == MD_RDONLY); if (mddev->ro == MD_AUTO_READ) { /* need to switch to read/write */ flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); did_change = 1; } rcu_read_lock(); percpu_ref_get(&mddev->writes_pending); smp_mb(); /* Match smp_mb in set_in_sync() */ if (mddev->safemode == 1) mddev->safemode = 0; /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ if (mddev->in_sync || mddev->sync_checkers) { spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); md_wakeup_thread(mddev->thread); did_change = 1; } spin_unlock(&mddev->lock); } rcu_read_unlock(); if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); if (!mddev->has_superblocks) return true; wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || is_md_suspended(mddev)); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { percpu_ref_put(&mddev->writes_pending); return false; } return true; } EXPORT_SYMBOL(md_write_start); /* md_write_inc can only be called when md_write_start() has * already been called at least once of the current request. * It increments the counter and is useful when a single request * is split into several parts. Each part causes an increment and * so needs a matching md_write_end(). * Unlike md_write_start(), it is safe to call md_write_inc() inside * a spinlocked region. */ void md_write_inc(struct mddev *mddev, struct bio *bi) { if (bio_data_dir(bi) != WRITE) return; WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); percpu_ref_get(&mddev->writes_pending); } EXPORT_SYMBOL(md_write_inc); void md_write_end(struct mddev *mddev) { percpu_ref_put(&mddev->writes_pending); if (mddev->safemode == 2) md_wakeup_thread(mddev->thread); else if (mddev->safemode_delay) /* The roundup() ensures this only performs locking once * every ->safemode_delay jiffies */ mod_timer(&mddev->safemode_timer, roundup(jiffies, mddev->safemode_delay) + mddev->safemode_delay); } EXPORT_SYMBOL(md_write_end); /* This is used by raid0 and raid10 */ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size) { struct bio *discard_bio = NULL; if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, &discard_bio) || !discard_bio) return; bio_chain(discard_bio, bio); bio_clone_blkg_association(discard_bio, bio); mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector); submit_bio_noacct(discard_bio); } EXPORT_SYMBOL_GPL(md_submit_discard_bio); static void md_end_clone_io(struct bio *bio) { struct md_io_clone *md_io_clone = bio->bi_private; struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; if (bio->bi_status && !orig_bio->bi_status) orig_bio->bi_status = bio->bi_status; if (md_io_clone->start_time) bio_end_io_acct(orig_bio, md_io_clone->start_time); bio_put(bio); bio_endio(orig_bio); percpu_ref_put(&mddev->active_io); } static void md_clone_bio(struct mddev *mddev, struct bio **bio) { struct block_device *bdev = (*bio)->bi_bdev; struct md_io_clone *md_io_clone; struct bio *clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); md_io_clone = container_of(clone, struct md_io_clone, bio_clone); md_io_clone->orig_bio = *bio; md_io_clone->mddev = mddev; if (blk_queue_io_stat(bdev->bd_disk->queue)) md_io_clone->start_time = bio_start_io_acct(*bio); clone->bi_end_io = md_end_clone_io; clone->bi_private = md_io_clone; *bio = clone; } void md_account_bio(struct mddev *mddev, struct bio **bio) { percpu_ref_get(&mddev->active_io); md_clone_bio(mddev, bio); } EXPORT_SYMBOL_GPL(md_account_bio); void md_free_cloned_bio(struct bio *bio) { struct md_io_clone *md_io_clone = bio->bi_private; struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; if (bio->bi_status && !orig_bio->bi_status) orig_bio->bi_status = bio->bi_status; if (md_io_clone->start_time) bio_end_io_acct(orig_bio, md_io_clone->start_time); bio_put(bio); percpu_ref_put(&mddev->active_io); } EXPORT_SYMBOL_GPL(md_free_cloned_bio); /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. It is important to call this before * attempting a GFP_KERNEL allocation while holding the mddev lock. * Must be called with mddev_lock held. */ void md_allow_write(struct mddev *mddev) { if (!mddev->pers) return; if (!md_is_rdwr(mddev)) return; if (!mddev->pers->sync_request) return; spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); if (mddev->safemode_delay && mddev->safemode == 0) mddev->safemode = 1; spin_unlock(&mddev->lock); md_update_sb(mddev, 0); sysfs_notify_dirent_safe(mddev->sysfs_state); /* wait for the dirty state to be recorded in the metadata */ wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } else spin_unlock(&mddev->lock); } EXPORT_SYMBOL_GPL(md_allow_write); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) #define UPDATE_FREQUENCY (5*60*HZ) void md_do_sync(struct md_thread *thread) { struct mddev *mddev = thread->mddev; struct mddev *mddev2; unsigned int currspeed = 0, window; sector_t max_sectors,j, io_sectors, recovery_done; unsigned long mark[SYNC_MARKS]; unsigned long update_time; sector_t mark_cnt[SYNC_MARKS]; int last_mark,m; sector_t last_check; int skipped = 0; struct md_rdev *rdev; char *desc, *action = NULL; struct blk_plug plug; int ret; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || !md_is_rdwr(mddev)) {/* never try to sync a read-only array */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); goto skip; } if (mddev_is_clustered(mddev)) { ret = md_cluster_ops->resync_start(mddev); if (ret) goto skip; set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) && ((unsigned long long)mddev->curr_resync_completed < (unsigned long long)mddev->resync_max_sectors)) goto skip; } if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { desc = "data-check"; action = "check"; } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { desc = "requested-resync"; action = "repair"; } else desc = "resync"; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) desc = "reshape"; else desc = "recovery"; mddev->last_sync_action = action ?: desc; /* * Before starting a resync we must have set curr_resync to * 2, and then checked that every "conflicting" array has curr_resync * less than ours. When we find one that is the same or higher * we wait on resync_wait. To avoid deadlock, we reduce curr_resync * to 1 if we choose to yield (based arbitrarily on address of mddev structure). * This will mean we have to start checking from the beginning again. * */ do { int mddev2_minor = -1; mddev->curr_resync = MD_RESYNC_DELAYED; try_again: if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; spin_lock(&all_mddevs_lock); list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { if (test_bit(MD_DELETED, &mddev2->flags)) continue; if (mddev2 == mddev) continue; if (!mddev->parallel_resync && mddev2->curr_resync && match_mddev_units(mddev, mddev2)) { DEFINE_WAIT(wq); if (mddev < mddev2 && mddev->curr_resync == MD_RESYNC_DELAYED) { /* arbitrarily yield */ mddev->curr_resync = MD_RESYNC_YIELDED; wake_up(&resync_wait); } if (mddev > mddev2 && mddev->curr_resync == MD_RESYNC_YIELDED) /* no need to wait here, we can wait the next * time 'round when curr_resync == 2 */ continue; /* We need to wait 'interruptible' so as not to * contribute to the load average, and not to * be caught by 'softlockup' */ prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev2->curr_resync >= mddev->curr_resync) { if (mddev2_minor != mddev2->md_minor) { mddev2_minor = mddev2->md_minor; pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", desc, mdname(mddev), mdname(mddev2)); } spin_unlock(&all_mddevs_lock); if (signal_pending(current)) flush_signals(current); schedule(); finish_wait(&resync_wait, &wq); goto try_again; } finish_wait(&resync_wait, &wq); } } spin_unlock(&all_mddevs_lock); } while (mddev->curr_resync < MD_RESYNC_DELAYED); j = 0; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* resync follows the size requested by the personality, * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; atomic64_set(&mddev->resync_mismatches, 0); /* we don't use the checkpoint if there's a bitmap */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) j = mddev->resync_min; else if (!mddev->bitmap) j = mddev->recovery_cp; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { max_sectors = mddev->resync_max_sectors; /* * If the original node aborts reshaping then we continue the * reshaping, so set j again to avoid restart reshape from the * first beginning */ if (mddev_is_clustered(mddev) && mddev->reshape_position != MaxSector) j = mddev->reshape_position; } else { /* recovery follows the physical size of devices */ max_sectors = mddev->dev_sectors; j = MaxSector; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < j) j = rdev->recovery_offset; rcu_read_unlock(); /* If there is a bitmap, we need to make sure all * writes that started before we added a spare * complete before we start doing a recovery. * Otherwise the write might complete and (via * bitmap_endwrite) set a bit in the bitmap after the * recovery has checked that bit and skipped that * region. */ if (mddev->bitmap) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } } pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", speed_max(mddev), desc); is_mddev_idle(mddev, 1); /* this initializes IO event counters */ io_sectors = 0; for (m = 0; m < SYNC_MARKS; m++) { mark[m] = jiffies; mark_cnt[m] = io_sectors; } last_mark = 0; mddev->resync_mark = mark[last_mark]; mddev->resync_mark_cnt = mark_cnt[last_mark]; /* * Tune reconstruction: */ window = 32 * (PAGE_SIZE / 512); pr_debug("md: using %dk window, over a total of %lluk.\n", window/2, (unsigned long long)max_sectors/2); atomic_set(&mddev->recovery_active, 0); last_check = 0; if (j >= MD_RESYNC_ACTIVE) { pr_debug("md: resuming %s of %s from checkpoint.\n", desc, mdname(mddev)); mddev->curr_resync = j; } else mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ mddev->curr_resync_completed = j; sysfs_notify_dirent_safe(mddev->sysfs_completed); md_new_event(); update_time = jiffies; blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; skipped = 0; if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && ((mddev->curr_resync > mddev->curr_resync_completed && (mddev->curr_resync - mddev->curr_resync_completed) > (max_sectors >> 4)) || time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || (j - mddev->curr_resync_completed)*2 >= mddev->resync_max - mddev->curr_resync_completed || mddev->curr_resync_completed > mddev->resync_max )) { /* time to update curr_resync_completed */ wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = j; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && j > mddev->recovery_cp) mddev->recovery_cp = j; update_time = jiffies; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); sysfs_notify_dirent_safe(mddev->sysfs_completed); } while (j >= mddev->resync_max && !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* As this condition is controlled by user-space, * we can block indefinitely, so use '_interruptible' * to avoid triggering warnings. */ flush_signals(current); /* just in case */ wait_event_interruptible(mddev->recovery_wait, mddev->resync_max > j || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); } if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; sectors = mddev->pers->sync_request(mddev, j, &skipped); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); break; } if (!skipped) { /* actual IO requested */ io_sectors += sectors; atomic_add(sectors, &mddev->recovery_active); } if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; j += sectors; if (j > max_sectors) /* when skipping, extra large numbers can be returned. */ j = max_sectors; if (j >= MD_RESYNC_ACTIVE) mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be * visible in /proc/mdstat */ md_new_event(); if (last_check + window > io_sectors || j == max_sectors) continue; last_check = io_sectors; repeat: if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { /* step marks */ int next = (last_mark+1) % SYNC_MARKS; mddev->resync_mark = mark[next]; mddev->resync_mark_cnt = mark_cnt[next]; mark[next] = jiffies; mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); last_mark = next; } if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; /* * this loop exits only if either when we are slower than * the 'hard' speed limit, or the system was IO-idle for * a jiffy. * the system might be non-idle CPU-wise, but we only care * about not overloading the IO subsystem. (things like an * e2fsck being done on the RAID array should execute fast) */ cond_resched(); recovery_done = io_sectors - atomic_read(&mddev->recovery_active); currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 /((jiffies-mddev->resync_mark)/HZ +1) +1; if (currspeed > speed_min(mddev)) { if (currspeed > speed_max(mddev)) { msleep(500); goto repeat; } if (!is_mddev_idle(mddev, 0)) { /* * Give other IO more of a chance. * The faster the devices, the less we wait. */ wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); } } } pr_info("md: %s: %s %s.\n",mdname(mddev), desc, test_bit(MD_RECOVERY_INTR, &mddev->recovery) ? "interrupted" : "done"); /* * this also signals 'finished resyncing' to md_stop */ blk_finish_plug(&plug); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev->curr_resync >= MD_RESYNC_ACTIVE) { mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify_dirent_safe(mddev->sysfs_completed); } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (mddev->curr_resync >= mddev->recovery_cp) { pr_debug("md: checkpointing %s of %s.\n", desc, mdname(mddev)); if (test_bit(MD_RECOVERY_ERROR, &mddev->recovery)) mddev->recovery_cp = mddev->curr_resync_completed; else mddev->recovery_cp = mddev->curr_resync; } } else mddev->recovery_cp = MaxSector; } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) rdev->recovery_offset = mddev->curr_resync; rcu_read_unlock(); } } } skip: /* set CHANGE_PENDING here since maybe another update is needed, * so other nodes are informed. It should be harmless for normal * raid */ set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev->delta_disks > 0 && mddev->pers->finish_reshape && mddev->pers->size && !mddev_is_dm(mddev)) { mddev_lock_nointr(mddev); md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); mddev_unlock(mddev); if (!mddev_is_clustered(mddev)) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); } spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = 0; mddev->resync_max = MaxSector; } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; set_bit(MD_RECOVERY_DONE, &mddev->recovery); mddev->curr_resync = MD_RESYNC_NONE; spin_unlock(&mddev->lock); wake_up(&resync_wait); md_wakeup_thread(mddev->thread); return; } EXPORT_SYMBOL_GPL(md_do_sync); static bool rdev_removeable(struct md_rdev *rdev) { /* rdev is not used. */ if (rdev->raid_disk < 0) return false; /* There are still inflight io, don't remove this rdev. */ if (atomic_read(&rdev->nr_pending)) return false; /* * An error occurred but has not yet been acknowledged by the metadata * handler, don't remove this rdev. */ if (test_bit(Blocked, &rdev->flags)) return false; /* Fautly rdev is not used, it's safe to remove it. */ if (test_bit(Faulty, &rdev->flags)) return true; /* Journal disk can only be removed if it's faulty. */ if (test_bit(Journal, &rdev->flags)) return false; /* * 'In_sync' is cleared while 'raid_disk' is valid, which means * replacement has just become active from pers->spare_active(), and * then pers->hot_remove_disk() will replace this rdev with replacement. */ if (!test_bit(In_sync, &rdev->flags)) return true; return false; } static bool rdev_is_spare(struct md_rdev *rdev) { return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags); } static bool rdev_addable(struct md_rdev *rdev) { /* rdev is already used, don't add it again. */ if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || test_bit(Faulty, &rdev->flags)) return false; /* Allow to add journal disk. */ if (test_bit(Journal, &rdev->flags)) return true; /* Allow to add if array is read-write. */ if (md_is_rdwr(rdev->mddev)) return true; /* * For read-only array, only allow to readd a rdev. And if bitmap is * used, don't allow to readd a rdev that is too old. */ if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) return true; return false; } static bool md_spares_need_change(struct mddev *mddev) { struct md_rdev *rdev; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { if (rdev_removeable(rdev) || rdev_addable(rdev)) { rcu_read_unlock(); return true; } } rcu_read_unlock(); return false; } static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this) { struct md_rdev *rdev; int spares = 0; int removed = 0; if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) /* Mustn't remove devices when resync thread is running */ return 0; rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && rdev_removeable(rdev) && !mddev->pers->hot_remove_disk(mddev, rdev)) { sysfs_unlink_rdev(mddev, rdev); rdev->saved_raid_disk = rdev->raid_disk; rdev->raid_disk = -1; removed++; } } if (removed && mddev->kobj.sd) sysfs_notify_dirent_safe(mddev->sysfs_degraded); if (this && removed) goto no_add; rdev_for_each(rdev, mddev) { if (this && this != rdev) continue; if (rdev_is_spare(rdev)) spares++; if (!rdev_addable(rdev)) continue; if (!test_bit(Journal, &rdev->flags)) rdev->recovery_offset = 0; if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { /* failure here is OK */ sysfs_link_rdev(mddev, rdev); if (!test_bit(Journal, &rdev->flags)) spares++; md_new_event(); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } no_add: if (removed) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); return spares; } static bool md_choose_sync_action(struct mddev *mddev, int *spares) { /* Check if reshape is in progress first. */ if (mddev->reshape_position != MaxSector) { if (mddev->pers->check_reshape == NULL || mddev->pers->check_reshape(mddev) != 0) return false; set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); return true; } /* * Remove any failed drives, then add spares if possible. Spares are * also removed and re-added, to allow the personality to fail the * re-add. */ *spares = remove_and_add_spares(mddev, NULL); if (*spares) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); /* Start new recovery. */ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); return true; } /* Check if recovery is in progress. */ if (mddev->recovery_cp < MaxSector) { set_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); return true; } /* Delay to choose resync/check/repair in md_do_sync(). */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) return true; /* Nothing to be done */ return false; } static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, sync_work); int spares = 0; bool suspend = false; char *name; /* * If reshape is still in progress, spares won't be added or removed * from conf until reshape is done. */ if (mddev->reshape_position == MaxSector && md_spares_need_change(mddev)) { suspend = true; mddev_suspend(mddev, false); } mddev_lock_nointr(mddev); if (!md_is_rdwr(mddev)) { /* * On a read-only array we can: * - remove failed devices * - add already-in_sync devices if the array itself is in-sync. * As we only add devices that are already in-sync, we can * activate the spares immediately. */ remove_and_add_spares(mddev, NULL); goto not_running; } if (!md_choose_sync_action(mddev, &spares)) goto not_running; if (!mddev->pers->sync_request) goto not_running; /* * We are adding a device or devices to an array which has the bitmap * stored on all devices. So make sure all bitmap pages get written. */ if (spares) md_bitmap_write_all(mddev->bitmap); name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? "reshape" : "resync"; rcu_assign_pointer(mddev->sync_thread, md_register_thread(md_do_sync, mddev, name)); if (!mddev->sync_thread) { pr_warn("%s: could not start resync thread...\n", mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ goto not_running; } mddev_unlock(mddev); /* * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should * not set it again. Otherwise, we may cause issue like this one: * https://bugzilla.kernel.org/show_bug.cgi?id=218200 * Therefore, use __mddev_resume(mddev, false). */ if (suspend) __mddev_resume(mddev, false); md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(); return; not_running: clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); mddev_unlock(mddev); /* * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should * not set it again. Otherwise, we may cause issue like this one: * https://bugzilla.kernel.org/show_bug.cgi?id=218200 * Therefore, use __mddev_resume(mddev, false). */ if (suspend) __mddev_resume(mddev, false); wake_up(&resync_wait); if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && mddev->sysfs_action) sysfs_notify_dirent_safe(mddev->sysfs_action); } static void unregister_sync_thread(struct mddev *mddev) { if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { /* resync/recovery still happening */ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); return; } if (WARN_ON_ONCE(!mddev->sync_thread)) return; md_reap_sync_thread(mddev); } /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. * Raid personalities that don't have a thread (linear/raid0) do not * need this as they never do any recovery or update the superblock. * * It does not do any resync itself, but rather "forks" off other threads * to do that as needed. * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in * "->recovery" and create a thread at ->sync_thread. * When the thread finishes it sets MD_RECOVERY_DONE * and wakeups up this thread which will reap the thread and finish up. * This thread also removes any faulty devices (with nr_pending == 0). * * The overall approach is: * 1/ if the superblock needs updating, update it. * 2/ If a recovery thread is running, don't do anything else. * 3/ If recovery has finished, clean up, possibly marking spares active. * 4/ If there are any faulty devices, remove them. * 5/ If array is degraded, try to add spares devices * 6/ If array has spares or is not in-sync, start a resync thread. */ void md_check_recovery(struct mddev *mddev) { if (mddev->bitmap) md_bitmap_daemon_work(mddev); if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) { pr_debug("md: %s in immediate safe mode\n", mdname(mddev)); mddev->safemode = 2; } flush_signals(current); } if (!md_is_rdwr(mddev) && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; if ( ! ( (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || (mddev->safemode == 2 && !mddev->in_sync && mddev->recovery_cp == MaxSector) )) return; if (mddev_trylock(mddev)) { bool try_set_sync = mddev->safemode != 0; if (!mddev->external && mddev->safemode == 1) mddev->safemode = 0; if (!md_is_rdwr(mddev)) { struct md_rdev *rdev; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { unregister_sync_thread(mddev); goto unlock; } if (!mddev->external && mddev->in_sync) /* * 'Blocked' flag not needed as failed devices * will be recorded if array switched to read/write. * Leaving it set will prevent the device * from being removed. */ rdev_for_each(rdev, mddev) clear_bit(Blocked, &rdev->flags); /* * There is no thread, but we need to call * ->spare_active and clear saved_raid_disk */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); /* * Let md_start_sync() to remove and add rdevs to the * array. */ if (md_spares_need_change(mddev)) { set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); queue_work(md_misc_wq, &mddev->sync_work); } clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); goto unlock; } if (mddev_is_clustered(mddev)) { struct md_rdev *rdev, *tmp; /* kick the device if another node issued a * remove disk. */ rdev_for_each_safe(rdev, tmp, mddev) { if (test_and_clear_bit(ClusterRemove, &rdev->flags) && rdev->raid_disk < 0) md_kick_rdev_from_array(rdev); } } if (try_set_sync && !mddev->external && !mddev->in_sync) { spin_lock(&mddev->lock); set_in_sync(mddev); spin_unlock(&mddev->lock); } if (mddev->sb_flags) md_update_sb(mddev, 0); /* * Never start a new sync thread if MD_RECOVERY_RUNNING is * still set. */ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { unregister_sync_thread(mddev); goto unlock; } /* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action". */ mddev->curr_resync_completed = 0; spin_lock(&mddev->lock); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); spin_unlock(&mddev->lock); /* Clear some bits that don't mean anything, but * might be left set */ clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { queue_work(md_misc_wq, &mddev->sync_work); } else { clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); wake_up(&resync_wait); } unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); } } EXPORT_SYMBOL(md_check_recovery); void md_reap_sync_thread(struct mddev *mddev) { struct md_rdev *rdev; sector_t old_dev_sectors = mddev->dev_sectors; bool is_reshaped = false; /* resync has finished, collect result */ md_unregister_thread(mddev, &mddev->sync_thread); atomic_inc(&mddev->sync_seq); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && mddev->degraded != mddev->raid_disks) { /* success...*/ /* activate any spares */ if (mddev->pers->spare_active(mddev)) { sysfs_notify_dirent_safe(mddev->sysfs_degraded); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && mddev->pers->finish_reshape) { mddev->pers->finish_reshape(mddev); if (mddev_is_clustered(mddev)) is_reshaped = true; } /* If array is no-longer degraded, then any saved_raid_disk * information must be scrapped. */ if (!mddev->degraded) rdev_for_each(rdev, mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by * clustered raid */ if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) md_cluster_ops->resync_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); /* * We call md_cluster_ops->update_size here because sync_size could * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, * so it is time to update size across cluster. */ if (mddev_is_clustered(mddev) && is_reshaped && !test_bit(MD_CLOSING, &mddev->flags)) md_cluster_ops->update_size(mddev, old_dev_sectors); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_completed); sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(); if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); wake_up(&resync_wait); } EXPORT_SYMBOL(md_reap_sync_thread); void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) { sysfs_notify_dirent_safe(rdev->sysfs_state); wait_event_timeout(rdev->blocked_wait, !test_bit(Blocked, &rdev->flags) && !test_bit(BlockedBadBlocks, &rdev->flags), msecs_to_jiffies(5000)); rdev_dec_pending(rdev, mddev); } EXPORT_SYMBOL(md_wait_for_blocked_rdev); void md_finish_reshape(struct mddev *mddev) { /* called be personality module when reshape completes. */ struct md_rdev *rdev; rdev_for_each(rdev, mddev) { if (rdev->data_offset > rdev->new_data_offset) rdev->sectors += rdev->data_offset - rdev->new_data_offset; else rdev->sectors -= rdev->new_data_offset - rdev->data_offset; rdev->data_offset = rdev->new_data_offset; } } EXPORT_SYMBOL(md_finish_reshape); /* Bad block management */ /* Returns 1 on success, 0 on failure */ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { struct mddev *mddev = rdev->mddev; int rv; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; rv = badblocks_set(&rdev->badblocks, s, sectors, 0); if (rv == 0) { /* Make sure they get written out promptly */ if (test_bit(ExternalBbl, &rdev->flags)) sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); sysfs_notify_dirent_safe(rdev->sysfs_state); set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); md_wakeup_thread(rdev->mddev->thread); return 1; } else return 0; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { int rv; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; rv = badblocks_clear(&rdev->badblocks, s, sectors); if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) sysfs_notify_dirent_safe(rdev->sysfs_badblocks); return rv; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { struct mddev *mddev, *n; int need_delay = 0; spin_lock(&all_mddevs_lock); list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); if (mddev_trylock(mddev)) { if (mddev->pers) __md_stop_writes(mddev); if (mddev->persistent) mddev->safemode = 2; mddev_unlock(mddev); } need_delay = 1; mddev_put(mddev); spin_lock(&all_mddevs_lock); } spin_unlock(&all_mddevs_lock); /* * certain more exotic SCSI devices are known to be * volatile wrt too early system reboots. While the * right place to handle this issue is the given * driver, we do want to have a safe RAID driver ... */ if (need_delay) msleep(1000); return NOTIFY_DONE; } static struct notifier_block md_notifier = { .notifier_call = md_notify_reboot, .next = NULL, .priority = INT_MAX, /* before any real devices */ }; static void md_geninit(void) { pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops); } static int __init md_init(void) { int ret = -ENOMEM; md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); if (!md_wq) goto err_wq; md_misc_wq = alloc_workqueue("md_misc", 0, 0); if (!md_misc_wq) goto err_misc_wq; md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!md_bitmap_wq) goto err_bitmap_wq; ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) goto err_md; ret = __register_blkdev(0, "mdp", md_probe); if (ret < 0) goto err_mdp; mdp_major = ret; register_reboot_notifier(&md_notifier); raid_table_header = register_sysctl("dev/raid", raid_table); md_geninit(); return 0; err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: destroy_workqueue(md_bitmap_wq); err_bitmap_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); err_wq: return ret; } static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) { struct mdp_superblock_1 *sb = page_address(rdev->sb_page); struct md_rdev *rdev2, *tmp; int role, ret; /* * If size is changed in another node then we need to * do resize as well. */ if (mddev->dev_sectors != le64_to_cpu(sb->size)) { ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); if (ret) pr_info("md-cluster: resize failed\n"); else md_bitmap_update_sb(mddev->bitmap); } /* Check for change of roles in the active devices */ rdev_for_each_safe(rdev2, tmp, mddev) { if (test_bit(Faulty, &rdev2->flags)) continue; /* Check if the roles changed */ role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); if (test_bit(Candidate, &rdev2->flags)) { if (role == MD_DISK_ROLE_FAULTY) { pr_info("md: Removing Candidate device %pg because add failed\n", rdev2->bdev); md_kick_rdev_from_array(rdev2); continue; } else clear_bit(Candidate, &rdev2->flags); } if (role != rdev2->raid_disk) { /* * got activated except reshape is happening. */ if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { rdev2->saved_raid_disk = role; ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %pg\n", rdev2->bdev); /* wakeup mddev->thread here, so array could * perform resync with the new activated disk */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } /* device faulty * We just want to do the minimum to mark the disk * as faulty. The recovery is performed by the * one who initiated the error. */ if (role == MD_DISK_ROLE_FAULTY || role == MD_DISK_ROLE_JOURNAL) { md_error(mddev, rdev2); clear_bit(Blocked, &rdev2->flags); } } } if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) { ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); if (ret) pr_warn("md: updating array disks failed. %d\n", ret); } /* * Since mddev->delta_disks has already updated in update_raid_disks, * so it is time to check reshape. */ if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { /* * reshape is happening in the remote node, we need to * update reshape_position and call start_reshape. */ mddev->reshape_position = le64_to_cpu(sb->reshape_position); if (mddev->pers->update_reshape_pos) mddev->pers->update_reshape_pos(mddev); if (mddev->pers->start_reshape) mddev->pers->start_reshape(mddev); } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && mddev->reshape_position != MaxSector && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { /* reshape is just done in another node. */ mddev->reshape_position = MaxSector; if (mddev->pers->update_reshape_pos) mddev->pers->update_reshape_pos(mddev); } /* Finally set the event to be up to date */ mddev->events = le64_to_cpu(sb->events); } static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) { int err; struct page *swapout = rdev->sb_page; struct mdp_superblock_1 *sb; /* Store the sb page of the rdev in the swapout temporary * variable in case we err in the future */ rdev->sb_page = NULL; err = alloc_disk_sb(rdev); if (err == 0) { ClearPageUptodate(rdev->sb_page); rdev->sb_loaded = 0; err = super_types[mddev->major_version]. load_super(rdev, NULL, mddev->minor_version); } if (err < 0) { pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", __func__, __LINE__, rdev->desc_nr, err); if (rdev->sb_page) put_page(rdev->sb_page); rdev->sb_page = swapout; rdev->sb_loaded = 1; return err; } sb = page_address(rdev->sb_page); /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET * is not set */ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); /* The other node finished recovery, call spare_active to set * device In_sync and mddev->degraded */ if (rdev->recovery_offset == MaxSector && !test_bit(In_sync, &rdev->flags) && mddev->pers->spare_active(mddev)) sysfs_notify_dirent_safe(mddev->sysfs_degraded); put_page(swapout); return 0; } void md_reload_sb(struct mddev *mddev, int nr) { struct md_rdev *rdev = NULL, *iter; int err; /* Find the rdev */ rdev_for_each_rcu(iter, mddev) { if (iter->desc_nr == nr) { rdev = iter; break; } } if (!rdev) { pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); return; } err = read_rdev(mddev, rdev); if (err < 0) return; check_sb_changes(mddev, rdev); /* Read all rdev's to update recovery_offset */ rdev_for_each_rcu(rdev, mddev) { if (!test_bit(Faulty, &rdev->flags)) read_rdev(mddev, rdev); } } EXPORT_SYMBOL(md_reload_sb); #ifndef MODULE /* * Searches all registered partitions for autorun RAID arrays * at boot time. */ static DEFINE_MUTEX(detected_devices_mutex); static LIST_HEAD(all_detected_devices); struct detected_devices_node { struct list_head list; dev_t dev; }; void md_autodetect_dev(dev_t dev) { struct detected_devices_node *node_detected_dev; node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); if (node_detected_dev) { node_detected_dev->dev = dev; mutex_lock(&detected_devices_mutex); list_add_tail(&node_detected_dev->list, &all_detected_devices); mutex_unlock(&detected_devices_mutex); } } void md_autostart_arrays(int part) { struct md_rdev *rdev; struct detected_devices_node *node_detected_dev; dev_t dev; int i_scanned, i_passed; i_scanned = 0; i_passed = 0; pr_info("md: Autodetecting RAID arrays.\n"); mutex_lock(&detected_devices_mutex); while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { i_scanned++; node_detected_dev = list_entry(all_detected_devices.next, struct detected_devices_node, list); list_del(&node_detected_dev->list); dev = node_detected_dev->dev; kfree(node_detected_dev); mutex_unlock(&detected_devices_mutex); rdev = md_import_device(dev,0, 90); mutex_lock(&detected_devices_mutex); if (IS_ERR(rdev)) continue; if (test_bit(Faulty, &rdev->flags)) continue; set_bit(AutoDetected, &rdev->flags); list_add(&rdev->same_set, &pending_raid_disks); i_passed++; } mutex_unlock(&detected_devices_mutex); pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed); autorun_devices(part); } #endif /* !MODULE */ static __exit void md_exit(void) { struct mddev *mddev, *n; int delay = 1; unregister_blkdev(MD_MAJOR,"md"); unregister_blkdev(mdp_major, "mdp"); unregister_reboot_notifier(&md_notifier); unregister_sysctl_table(raid_table_header); /* We cannot unload the modules while some process is * waiting for us in select() or poll() - wake them up */ md_unloading = 1; while (waitqueue_active(&md_event_waiters)) { /* not safe to leave yet */ wake_up(&md_event_waiters); msleep(delay); delay += delay; } remove_proc_entry("mdstat", NULL); spin_lock(&all_mddevs_lock); list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); export_array(mddev); mddev->ctime = 0; mddev->hold_active = 0; /* * As the mddev is now fully clear, mddev_put will schedule * the mddev for destruction by a workqueue, and the * destroy_workqueue() below will wait for that to complete. */ mddev_put(mddev); spin_lock(&all_mddevs_lock); } spin_unlock(&all_mddevs_lock); destroy_workqueue(md_misc_wq); destroy_workqueue(md_bitmap_wq); destroy_workqueue(md_wq); } subsys_initcall(md_init); module_exit(md_exit) static int get_ro(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%d\n", start_readonly); } static int set_ro(const char *val, const struct kernel_param *kp) { return kstrtouint(val, 10, (unsigned int *)&start_readonly); } module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); module_param(create_on_open, bool, S_IRUSR|S_IWUSR); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); MODULE_ALIAS("md"); MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); |
7 6 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner * * This file contains the IRQ-resend code * * If the interrupt is waiting to be processed, we try to re-run it. * We can't directly run it from here since the caller might be in an * interrupt-protected region. Not all irq controller chips can * retrigger interrupts at the hardware level, so in those cases * we allow the resending of IRQs via a tasklet. */ #include <linux/irq.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> #include "internals.h" #ifdef CONFIG_HARDIRQS_SW_RESEND /* hlist_head to handle software resend of interrupts: */ static HLIST_HEAD(irq_resend_list); static DEFINE_RAW_SPINLOCK(irq_resend_lock); /* * Run software resends of IRQ's */ static void resend_irqs(struct tasklet_struct *unused) { struct irq_desc *desc; raw_spin_lock_irq(&irq_resend_lock); while (!hlist_empty(&irq_resend_list)) { desc = hlist_entry(irq_resend_list.first, struct irq_desc, resend_node); hlist_del_init(&desc->resend_node); raw_spin_unlock(&irq_resend_lock); desc->handle_irq(desc); raw_spin_lock(&irq_resend_lock); } raw_spin_unlock_irq(&irq_resend_lock); } /* Tasklet to handle resend: */ static DECLARE_TASKLET(resend_tasklet, resend_irqs); static int irq_sw_resend(struct irq_desc *desc) { /* * Validate whether this interrupt can be safely injected from * non interrupt context */ if (handle_enforce_irqctx(&desc->irq_data)) return -EINVAL; /* * If the interrupt is running in the thread context of the parent * irq we need to be careful, because we cannot trigger it * directly. */ if (irq_settings_is_nested_thread(desc)) { /* * If the parent_irq is valid, we retrigger the parent, * otherwise we do nothing. */ if (!desc->parent_irq) return -EINVAL; desc = irq_to_desc(desc->parent_irq); if (!desc) return -EINVAL; } /* Add to resend_list and activate the softirq: */ raw_spin_lock(&irq_resend_lock); if (hlist_unhashed(&desc->resend_node)) hlist_add_head(&desc->resend_node, &irq_resend_list); raw_spin_unlock(&irq_resend_lock); tasklet_schedule(&resend_tasklet); return 0; } void clear_irq_resend(struct irq_desc *desc) { raw_spin_lock(&irq_resend_lock); hlist_del_init(&desc->resend_node); raw_spin_unlock(&irq_resend_lock); } void irq_resend_init(struct irq_desc *desc) { INIT_HLIST_NODE(&desc->resend_node); } #else void clear_irq_resend(struct irq_desc *desc) {} void irq_resend_init(struct irq_desc *desc) {} static int irq_sw_resend(struct irq_desc *desc) { return -EINVAL; } #endif static int try_retrigger(struct irq_desc *desc) { if (desc->irq_data.chip->irq_retrigger) return desc->irq_data.chip->irq_retrigger(&desc->irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY return irq_chip_retrigger_hierarchy(&desc->irq_data); #else return 0; #endif } /* * IRQ resend * * Is called with interrupts disabled and desc->lock held. */ int check_irq_resend(struct irq_desc *desc, bool inject) { int err = 0; /* * We do not resend level type interrupts. Level type interrupts * are resent by hardware when they are still active. Clear the * pending bit so suspend/resume does not get confused. */ if (irq_settings_is_level(desc)) { desc->istate &= ~IRQS_PENDING; return -EINVAL; } if (desc->istate & IRQS_REPLAY) return -EBUSY; if (!(desc->istate & IRQS_PENDING) && !inject) return 0; desc->istate &= ~IRQS_PENDING; if (!try_retrigger(desc)) err = irq_sw_resend(desc); /* If the retrigger was successful, mark it with the REPLAY bit */ if (!err) desc->istate |= IRQS_REPLAY; return err; } #ifdef CONFIG_GENERIC_IRQ_INJECTION /** * irq_inject_interrupt - Inject an interrupt for testing/error injection * @irq: The interrupt number * * This function must only be used for debug and testing purposes! * * Especially on x86 this can cause a premature completion of an interrupt * affinity change causing the interrupt line to become stale. Very * unlikely, but possible. * * The injection can fail for various reasons: * - Interrupt is not activated * - Interrupt is NMI type or currently replaying * - Interrupt is level type * - Interrupt does not support hardware retrigger and software resend is * either not enabled or not possible for the interrupt. */ int irq_inject_interrupt(unsigned int irq) { struct irq_desc *desc; unsigned long flags; int err; /* Try the state injection hardware interface first */ if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true)) return 0; /* That failed, try via the resend mechanism */ desc = irq_get_desc_buslock(irq, &flags, 0); if (!desc) return -EINVAL; /* * Only try to inject when the interrupt is: * - not NMI type * - activated */ if ((desc->istate & IRQS_NMI) || !irqd_is_activated(&desc->irq_data)) err = -EINVAL; else err = check_irq_resend(desc, true); irq_put_desc_busunlock(desc, flags); return err; } EXPORT_SYMBOL_GPL(irq_inject_interrupt); #endif |
3 3 3 4 4 454 456 67 67 173 173 25 25 25 23 2 2 2 2 14737 14797 14746 14732 645 646 643 24 241 87 349 53 53 81 81 35 35 244 244 29 118 89 90 31 31 18 18 102 102 102 24 24 50 50 76 76 1016 41 48 930 925 926 3353 3353 1411 1413 4480 4529 18 4203 58 91 235 275 276 46 46 25 15 15 15 15 9 10 10 954 957 954 24 24 24 38 38 37 11 11 11 6 1 3 1 1 3 3 4 1 3 3 1 4 14 1 8 6 2 2 1 2 4 2 3 1 2 13 13 1351 1360 12 12 11 47 47 47 47 46 5 5 644 642 640 498 496 498 23 3 20 23 23 23 481 482 481 384 189 199 7 382 384 384 589 587 590 586 581 561 558 561 558 561 88 87 87 88 91 91 91 91 91 10733 10737 10738 10733 10737 10032 970 191 191 187 192 53 38 4406 4402 4403 4409 1362 3110 100 7648 7640 1 1 7743 34 5 29 46 46 52 53 53 53 53 53 53 53 53 916 913 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 | // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor LSM hooks. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #include <linux/lsm_hooks.h> #include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/ptrace.h> #include <linux/ctype.h> #include <linux/sysctl.h> #include <linux/audit.h> #include <linux/user_namespace.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/zstd.h> #include <net/sock.h> #include <uapi/linux/mount.h> #include <uapi/linux/lsm.h> #include "include/apparmor.h" #include "include/apparmorfs.h" #include "include/audit.h" #include "include/capability.h" #include "include/cred.h" #include "include/file.h" #include "include/ipc.h" #include "include/net.h" #include "include/path.h" #include "include/label.h" #include "include/policy.h" #include "include/policy_ns.h" #include "include/procattr.h" #include "include/mount.h" #include "include/secid.h" /* Flag indicating whether initialization completed */ int apparmor_initialized; union aa_buffer { struct list_head list; DECLARE_FLEX_ARRAY(char, buffer); }; struct aa_local_cache { unsigned int hold; unsigned int count; struct list_head head; }; #define RESERVE_COUNT 2 static int reserve_count = RESERVE_COUNT; static int buffer_count; static LIST_HEAD(aa_global_buffers); static DEFINE_SPINLOCK(aa_buffers_lock); static DEFINE_PER_CPU(struct aa_local_cache, aa_local_buffers); /* * LSM hook functions */ /* * put the associated labels */ static void apparmor_cred_free(struct cred *cred) { aa_put_label(cred_label(cred)); set_cred_label(cred, NULL); } /* * allocate the apparmor part of blank credentials */ static int apparmor_cred_alloc_blank(struct cred *cred, gfp_t gfp) { set_cred_label(cred, NULL); return 0; } /* * prepare new cred label for modification by prepare_cred block */ static int apparmor_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { set_cred_label(new, aa_get_newest_label(cred_label(old))); return 0; } /* * transfer the apparmor data to a blank set of creds */ static void apparmor_cred_transfer(struct cred *new, const struct cred *old) { set_cred_label(new, aa_get_newest_label(cred_label(old))); } static void apparmor_task_free(struct task_struct *task) { aa_free_task_ctx(task_ctx(task)); } static int apparmor_task_alloc(struct task_struct *task, unsigned long clone_flags) { struct aa_task_ctx *new = task_ctx(task); aa_dup_task_ctx(new, task_ctx(current)); return 0; } static int apparmor_ptrace_access_check(struct task_struct *child, unsigned int mode) { struct aa_label *tracer, *tracee; const struct cred *cred; int error; cred = get_task_cred(child); tracee = cred_label(cred); /* ref count on cred */ tracer = __begin_current_label_crit_section(); error = aa_may_ptrace(current_cred(), tracer, cred, tracee, (mode & PTRACE_MODE_READ) ? AA_PTRACE_READ : AA_PTRACE_TRACE); __end_current_label_crit_section(tracer); put_cred(cred); return error; } static int apparmor_ptrace_traceme(struct task_struct *parent) { struct aa_label *tracer, *tracee; const struct cred *cred; int error; tracee = __begin_current_label_crit_section(); cred = get_task_cred(parent); tracer = cred_label(cred); /* ref count on cred */ error = aa_may_ptrace(cred, tracer, current_cred(), tracee, AA_PTRACE_TRACE); put_cred(cred); __end_current_label_crit_section(tracee); return error; } /* Derived from security/commoncap.c:cap_capget */ static int apparmor_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { struct aa_label *label; const struct cred *cred; rcu_read_lock(); cred = __task_cred(target); label = aa_get_newest_cred_label(cred); /* * cap_capget is stacked ahead of this and will * initialize effective and permitted. */ if (!unconfined(label)) { struct aa_profile *profile; struct label_it i; label_for_each_confined(i, label, profile) { struct aa_ruleset *rules; if (COMPLAIN_MODE(profile)) continue; rules = list_first_entry(&profile->rules, typeof(*rules), list); *effective = cap_intersect(*effective, rules->caps.allow); *permitted = cap_intersect(*permitted, rules->caps.allow); } } rcu_read_unlock(); aa_put_label(label); return 0; } static int apparmor_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { struct aa_label *label; int error = 0; label = aa_get_newest_cred_label(cred); if (!unconfined(label)) error = aa_capable(cred, label, cap, opts); aa_put_label(label); return error; } /** * common_perm - basic common permission check wrapper fn for paths * @op: operation being checked * @path: path to check permission of (NOT NULL) * @mask: requested permissions mask * @cond: conditional info for the permission request (NOT NULL) * * Returns: %0 else error code if error or permission denied */ static int common_perm(const char *op, const struct path *path, u32 mask, struct path_cond *cond) { struct aa_label *label; int error = 0; label = __begin_current_label_crit_section(); if (!unconfined(label)) error = aa_path_perm(op, current_cred(), label, path, 0, mask, cond); __end_current_label_crit_section(label); return error; } /** * common_perm_cond - common permission wrapper around inode cond * @op: operation being checked * @path: location to check (NOT NULL) * @mask: requested permissions mask * * Returns: %0 else error code if error or permission denied */ static int common_perm_cond(const char *op, const struct path *path, u32 mask) { vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_idmap(path->mnt), d_backing_inode(path->dentry)); struct path_cond cond = { vfsuid_into_kuid(vfsuid), d_backing_inode(path->dentry)->i_mode }; if (!path_mediated_fs(path->dentry)) return 0; return common_perm(op, path, mask, &cond); } /** * common_perm_dir_dentry - common permission wrapper when path is dir, dentry * @op: operation being checked * @dir: directory of the dentry (NOT NULL) * @dentry: dentry to check (NOT NULL) * @mask: requested permissions mask * @cond: conditional info for the permission request (NOT NULL) * * Returns: %0 else error code if error or permission denied */ static int common_perm_dir_dentry(const char *op, const struct path *dir, struct dentry *dentry, u32 mask, struct path_cond *cond) { struct path path = { .mnt = dir->mnt, .dentry = dentry }; return common_perm(op, &path, mask, cond); } /** * common_perm_rm - common permission wrapper for operations doing rm * @op: operation being checked * @dir: directory that the dentry is in (NOT NULL) * @dentry: dentry being rm'd (NOT NULL) * @mask: requested permission mask * * Returns: %0 else error code if error or permission denied */ static int common_perm_rm(const char *op, const struct path *dir, struct dentry *dentry, u32 mask) { struct inode *inode = d_backing_inode(dentry); struct path_cond cond = { }; vfsuid_t vfsuid; if (!inode || !path_mediated_fs(dentry)) return 0; vfsuid = i_uid_into_vfsuid(mnt_idmap(dir->mnt), inode); cond.uid = vfsuid_into_kuid(vfsuid); cond.mode = inode->i_mode; return common_perm_dir_dentry(op, dir, dentry, mask, &cond); } /** * common_perm_create - common permission wrapper for operations doing create * @op: operation being checked * @dir: directory that dentry will be created in (NOT NULL) * @dentry: dentry to create (NOT NULL) * @mask: request permission mask * @mode: created file mode * * Returns: %0 else error code if error or permission denied */ static int common_perm_create(const char *op, const struct path *dir, struct dentry *dentry, u32 mask, umode_t mode) { struct path_cond cond = { current_fsuid(), mode }; if (!path_mediated_fs(dir->dentry)) return 0; return common_perm_dir_dentry(op, dir, dentry, mask, &cond); } static int apparmor_path_unlink(const struct path *dir, struct dentry *dentry) { return common_perm_rm(OP_UNLINK, dir, dentry, AA_MAY_DELETE); } static int apparmor_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode) { return common_perm_create(OP_MKDIR, dir, dentry, AA_MAY_CREATE, S_IFDIR); } static int apparmor_path_rmdir(const struct path *dir, struct dentry *dentry) { return common_perm_rm(OP_RMDIR, dir, dentry, AA_MAY_DELETE); } static int apparmor_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) { return common_perm_create(OP_MKNOD, dir, dentry, AA_MAY_CREATE, mode); } static int apparmor_path_truncate(const struct path *path) { return common_perm_cond(OP_TRUNC, path, MAY_WRITE | AA_MAY_SETATTR); } static int apparmor_file_truncate(struct file *file) { return apparmor_path_truncate(&file->f_path); } static int apparmor_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name) { return common_perm_create(OP_SYMLINK, dir, dentry, AA_MAY_CREATE, S_IFLNK); } static int apparmor_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { struct aa_label *label; int error = 0; if (!path_mediated_fs(old_dentry)) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) error = aa_path_link(current_cred(), label, old_dentry, new_dir, new_dentry); end_current_label_crit_section(label); return error; } static int apparmor_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, const unsigned int flags) { struct aa_label *label; int error = 0; if (!path_mediated_fs(old_dentry)) return 0; if ((flags & RENAME_EXCHANGE) && !path_mediated_fs(new_dentry)) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) { struct mnt_idmap *idmap = mnt_idmap(old_dir->mnt); vfsuid_t vfsuid; struct path old_path = { .mnt = old_dir->mnt, .dentry = old_dentry }; struct path new_path = { .mnt = new_dir->mnt, .dentry = new_dentry }; struct path_cond cond = { .mode = d_backing_inode(old_dentry)->i_mode }; vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond.uid = vfsuid_into_kuid(vfsuid); if (flags & RENAME_EXCHANGE) { struct path_cond cond_exchange = { .mode = d_backing_inode(new_dentry)->i_mode, }; vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond_exchange.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_RENAME_SRC, current_cred(), label, &new_path, 0, MAY_READ | AA_MAY_GETATTR | MAY_WRITE | AA_MAY_SETATTR | AA_MAY_DELETE, &cond_exchange); if (!error) error = aa_path_perm(OP_RENAME_DEST, current_cred(), label, &old_path, 0, MAY_WRITE | AA_MAY_SETATTR | AA_MAY_CREATE, &cond_exchange); } if (!error) error = aa_path_perm(OP_RENAME_SRC, current_cred(), label, &old_path, 0, MAY_READ | AA_MAY_GETATTR | MAY_WRITE | AA_MAY_SETATTR | AA_MAY_DELETE, &cond); if (!error) error = aa_path_perm(OP_RENAME_DEST, current_cred(), label, &new_path, 0, MAY_WRITE | AA_MAY_SETATTR | AA_MAY_CREATE, &cond); } end_current_label_crit_section(label); return error; } static int apparmor_path_chmod(const struct path *path, umode_t mode) { return common_perm_cond(OP_CHMOD, path, AA_MAY_CHMOD); } static int apparmor_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { return common_perm_cond(OP_CHOWN, path, AA_MAY_CHOWN); } static int apparmor_inode_getattr(const struct path *path) { return common_perm_cond(OP_GETATTR, path, AA_MAY_GETATTR); } static int apparmor_file_open(struct file *file) { struct aa_file_ctx *fctx = file_ctx(file); struct aa_label *label; int error = 0; if (!path_mediated_fs(file->f_path.dentry)) return 0; /* If in exec, permission is handled by bprm hooks. * Cache permissions granted by the previous exec check, with * implicit read and executable mmap which are required to * actually execute the image. * * Illogically, FMODE_EXEC is in f_flags, not f_mode. */ if (file->f_flags & __FMODE_EXEC) { fctx->allow = MAY_EXEC | MAY_READ | AA_EXEC_MMAP; return 0; } label = aa_get_newest_cred_label(file->f_cred); if (!unconfined(label)) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); vfsuid_t vfsuid; struct path_cond cond = { .mode = inode->i_mode, }; vfsuid = i_uid_into_vfsuid(idmap, inode); cond.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_OPEN, file->f_cred, label, &file->f_path, 0, aa_map_file_to_perms(file), &cond); /* todo cache full allowed permissions set and state */ fctx->allow = aa_map_file_to_perms(file); } aa_put_label(label); return error; } static int apparmor_file_alloc_security(struct file *file) { struct aa_file_ctx *ctx = file_ctx(file); struct aa_label *label = begin_current_label_crit_section(); spin_lock_init(&ctx->lock); rcu_assign_pointer(ctx->label, aa_get_label(label)); end_current_label_crit_section(label); return 0; } static void apparmor_file_free_security(struct file *file) { struct aa_file_ctx *ctx = file_ctx(file); if (ctx) aa_put_label(rcu_access_pointer(ctx->label)); } static int common_file_perm(const char *op, struct file *file, u32 mask, bool in_atomic) { struct aa_label *label; int error = 0; /* don't reaudit files closed during inheritance */ if (file->f_path.dentry == aa_null.dentry) return -EACCES; label = __begin_current_label_crit_section(); error = aa_file_perm(op, current_cred(), label, file, mask, in_atomic); __end_current_label_crit_section(label); return error; } static int apparmor_file_receive(struct file *file) { return common_file_perm(OP_FRECEIVE, file, aa_map_file_to_perms(file), false); } static int apparmor_file_permission(struct file *file, int mask) { return common_file_perm(OP_FPERM, file, mask, false); } static int apparmor_file_lock(struct file *file, unsigned int cmd) { u32 mask = AA_MAY_LOCK; if (cmd == F_WRLCK) mask |= MAY_WRITE; return common_file_perm(OP_FLOCK, file, mask, false); } static int common_mmap(const char *op, struct file *file, unsigned long prot, unsigned long flags, bool in_atomic) { int mask = 0; if (!file || !file_ctx(file)) return 0; if (prot & PROT_READ) mask |= MAY_READ; /* * Private mappings don't require write perms since they don't * write back to the files */ if ((prot & PROT_WRITE) && !(flags & MAP_PRIVATE)) mask |= MAY_WRITE; if (prot & PROT_EXEC) mask |= AA_EXEC_MMAP; return common_file_perm(op, file, mask, in_atomic); } static int apparmor_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { return common_mmap(OP_FMMAP, file, prot, flags, GFP_ATOMIC); } static int apparmor_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { return common_mmap(OP_FMPROT, vma->vm_file, prot, !(vma->vm_flags & VM_SHARED) ? MAP_PRIVATE : 0, false); } #ifdef CONFIG_IO_URING static const char *audit_uring_mask(u32 mask) { if (mask & AA_MAY_CREATE_SQPOLL) return "sqpoll"; if (mask & AA_MAY_OVERRIDE_CRED) return "override_creds"; return ""; } static void audit_uring_cb(struct audit_buffer *ab, void *va) { struct apparmor_audit_data *ad = aad_of_va(va); if (ad->request & AA_URING_PERM_MASK) { audit_log_format(ab, " requested=\"%s\"", audit_uring_mask(ad->request)); if (ad->denied & AA_URING_PERM_MASK) { audit_log_format(ab, " denied=\"%s\"", audit_uring_mask(ad->denied)); } } if (ad->uring.target) { audit_log_format(ab, " tcontext="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->uring.target, FLAGS_NONE, GFP_ATOMIC); } } static int profile_uring(struct aa_profile *profile, u32 request, struct aa_label *new, int cap, struct apparmor_audit_data *ad) { unsigned int state; struct aa_ruleset *rules; int error = 0; AA_BUG(!profile); rules = list_first_entry(&profile->rules, typeof(*rules), list); state = RULE_MEDIATES(rules, AA_CLASS_IO_URING); if (state) { struct aa_perms perms = { }; if (new) { aa_label_match(profile, rules, new, state, false, request, &perms); } else { perms = *aa_lookup_perms(rules->policy, state); } aa_apply_modes_to_perms(profile, &perms); error = aa_check_perms(profile, &perms, request, ad, audit_uring_cb); } return error; } /** * apparmor_uring_override_creds - check the requested cred override * @new: the target creds * * Check to see if the current task is allowed to override it's credentials * to service an io_uring operation. */ static int apparmor_uring_override_creds(const struct cred *new) { struct aa_profile *profile; struct aa_label *label; int error; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING, OP_URING_OVERRIDE); ad.uring.target = cred_label(new); label = __begin_current_label_crit_section(); error = fn_for_each(label, profile, profile_uring(profile, AA_MAY_OVERRIDE_CRED, cred_label(new), CAP_SYS_ADMIN, &ad)); __end_current_label_crit_section(label); return error; } /** * apparmor_uring_sqpoll - check if a io_uring polling thread can be created * * Check to see if the current task is allowed to create a new io_uring * kernel polling thread. */ static int apparmor_uring_sqpoll(void) { struct aa_profile *profile; struct aa_label *label; int error; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING, OP_URING_SQPOLL); label = __begin_current_label_crit_section(); error = fn_for_each(label, profile, profile_uring(profile, AA_MAY_CREATE_SQPOLL, NULL, CAP_SYS_ADMIN, &ad)); __end_current_label_crit_section(label); return error; } #endif /* CONFIG_IO_URING */ static int apparmor_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { struct aa_label *label; int error = 0; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; flags &= ~AA_MS_IGNORE_MASK; label = __begin_current_label_crit_section(); if (!unconfined(label)) { if (flags & MS_REMOUNT) error = aa_remount(current_cred(), label, path, flags, data); else if (flags & MS_BIND) error = aa_bind_mount(current_cred(), label, path, dev_name, flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) error = aa_mount_change_type(current_cred(), label, path, flags); else if (flags & MS_MOVE) error = aa_move_mount_old(current_cred(), label, path, dev_name); else error = aa_new_mount(current_cred(), label, dev_name, path, type, flags, data); } __end_current_label_crit_section(label); return error; } static int apparmor_move_mount(const struct path *from_path, const struct path *to_path) { struct aa_label *label; int error = 0; label = __begin_current_label_crit_section(); if (!unconfined(label)) error = aa_move_mount(current_cred(), label, from_path, to_path); __end_current_label_crit_section(label); return error; } static int apparmor_sb_umount(struct vfsmount *mnt, int flags) { struct aa_label *label; int error = 0; label = __begin_current_label_crit_section(); if (!unconfined(label)) error = aa_umount(current_cred(), label, mnt, flags); __end_current_label_crit_section(label); return error; } static int apparmor_sb_pivotroot(const struct path *old_path, const struct path *new_path) { struct aa_label *label; int error = 0; label = aa_get_current_label(); if (!unconfined(label)) error = aa_pivotroot(current_cred(), label, old_path, new_path); aa_put_label(label); return error; } static int apparmor_getselfattr(unsigned int attr, struct lsm_ctx __user *lx, u32 *size, u32 flags) { int error = -ENOENT; struct aa_task_ctx *ctx = task_ctx(current); struct aa_label *label = NULL; char *value = NULL; switch (attr) { case LSM_ATTR_CURRENT: label = aa_get_newest_label(cred_label(current_cred())); break; case LSM_ATTR_PREV: if (ctx->previous) label = aa_get_newest_label(ctx->previous); break; case LSM_ATTR_EXEC: if (ctx->onexec) label = aa_get_newest_label(ctx->onexec); break; default: error = -EOPNOTSUPP; break; } if (label) { error = aa_getprocattr(label, &value, false); if (error > 0) error = lsm_fill_user_ctx(lx, size, value, error, LSM_ID_APPARMOR, 0); kfree(value); } aa_put_label(label); if (error < 0) return error; return 1; } static int apparmor_getprocattr(struct task_struct *task, const char *name, char **value) { int error = -ENOENT; /* released below */ const struct cred *cred = get_task_cred(task); struct aa_task_ctx *ctx = task_ctx(current); struct aa_label *label = NULL; if (strcmp(name, "current") == 0) label = aa_get_newest_label(cred_label(cred)); else if (strcmp(name, "prev") == 0 && ctx->previous) label = aa_get_newest_label(ctx->previous); else if (strcmp(name, "exec") == 0 && ctx->onexec) label = aa_get_newest_label(ctx->onexec); else error = -EINVAL; if (label) error = aa_getprocattr(label, value, true); aa_put_label(label); put_cred(cred); return error; } static int do_setattr(u64 attr, void *value, size_t size) { char *command, *largs = NULL, *args = value; size_t arg_size; int error; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_NONE, OP_SETPROCATTR); if (size == 0) return -EINVAL; /* AppArmor requires that the buffer must be null terminated atm */ if (args[size - 1] != '\0') { /* null terminate */ largs = args = kmalloc(size + 1, GFP_KERNEL); if (!args) return -ENOMEM; memcpy(args, value, size); args[size] = '\0'; } error = -EINVAL; args = strim(args); command = strsep(&args, " "); if (!args) goto out; args = skip_spaces(args); if (!*args) goto out; arg_size = size - (args - (largs ? largs : (char *) value)); if (attr == LSM_ATTR_CURRENT) { if (strcmp(command, "changehat") == 0) { error = aa_setprocattr_changehat(args, arg_size, AA_CHANGE_NOFLAGS); } else if (strcmp(command, "permhat") == 0) { error = aa_setprocattr_changehat(args, arg_size, AA_CHANGE_TEST); } else if (strcmp(command, "changeprofile") == 0) { error = aa_change_profile(args, AA_CHANGE_NOFLAGS); } else if (strcmp(command, "permprofile") == 0) { error = aa_change_profile(args, AA_CHANGE_TEST); } else if (strcmp(command, "stack") == 0) { error = aa_change_profile(args, AA_CHANGE_STACK); } else goto fail; } else if (attr == LSM_ATTR_EXEC) { if (strcmp(command, "exec") == 0) error = aa_change_profile(args, AA_CHANGE_ONEXEC); else if (strcmp(command, "stack") == 0) error = aa_change_profile(args, (AA_CHANGE_ONEXEC | AA_CHANGE_STACK)); else goto fail; } else /* only support the "current" and "exec" process attributes */ goto fail; if (!error) error = size; out: kfree(largs); return error; fail: ad.subj_label = begin_current_label_crit_section(); if (attr == LSM_ATTR_CURRENT) ad.info = "current"; else if (attr == LSM_ATTR_EXEC) ad.info = "exec"; else ad.info = "invalid"; ad.error = error = -EINVAL; aa_audit_msg(AUDIT_APPARMOR_DENIED, &ad, NULL); end_current_label_crit_section(ad.subj_label); goto out; } static int apparmor_setselfattr(unsigned int attr, struct lsm_ctx *ctx, u32 size, u32 flags) { int rc; if (attr != LSM_ATTR_CURRENT && attr != LSM_ATTR_EXEC) return -EOPNOTSUPP; rc = do_setattr(attr, ctx->ctx, ctx->ctx_len); if (rc > 0) return 0; return rc; } static int apparmor_setprocattr(const char *name, void *value, size_t size) { int attr = lsm_name_to_attr(name); if (attr) return do_setattr(attr, value, size); return -EINVAL; } /** * apparmor_bprm_committing_creds - do task cleanup on committing new creds * @bprm: binprm for the exec (NOT NULL) */ static void apparmor_bprm_committing_creds(const struct linux_binprm *bprm) { struct aa_label *label = aa_current_raw_label(); struct aa_label *new_label = cred_label(bprm->cred); /* bail out if unconfined or not changing profile */ if ((new_label->proxy == label->proxy) || (unconfined(new_label))) return; aa_inherit_files(bprm->cred, current->files); current->pdeath_signal = 0; /* reset soft limits and set hard limits for the new label */ __aa_transition_rlimits(label, new_label); } /** * apparmor_bprm_committed_creds() - do cleanup after new creds committed * @bprm: binprm for the exec (NOT NULL) */ static void apparmor_bprm_committed_creds(const struct linux_binprm *bprm) { /* clear out temporary/transitional state from the context */ aa_clear_task_ctx_trans(task_ctx(current)); return; } static void apparmor_current_getsecid_subj(u32 *secid) { struct aa_label *label = __begin_current_label_crit_section(); *secid = label->secid; __end_current_label_crit_section(label); } static void apparmor_task_getsecid_obj(struct task_struct *p, u32 *secid) { struct aa_label *label = aa_get_task_label(p); *secid = label->secid; aa_put_label(label); } static int apparmor_task_setrlimit(struct task_struct *task, unsigned int resource, struct rlimit *new_rlim) { struct aa_label *label = __begin_current_label_crit_section(); int error = 0; if (!unconfined(label)) error = aa_task_setrlimit(current_cred(), label, task, resource, new_rlim); __end_current_label_crit_section(label); return error; } static int apparmor_task_kill(struct task_struct *target, struct kernel_siginfo *info, int sig, const struct cred *cred) { const struct cred *tc; struct aa_label *cl, *tl; int error; tc = get_task_cred(target); tl = aa_get_newest_cred_label(tc); if (cred) { /* * Dealing with USB IO specific behavior */ cl = aa_get_newest_cred_label(cred); error = aa_may_signal(cred, cl, tc, tl, sig); aa_put_label(cl); } else { cl = __begin_current_label_crit_section(); error = aa_may_signal(current_cred(), cl, tc, tl, sig); __end_current_label_crit_section(cl); } aa_put_label(tl); put_cred(tc); return error; } static int apparmor_userns_create(const struct cred *cred) { struct aa_label *label; struct aa_profile *profile; int error = 0; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_TASK, AA_CLASS_NS, OP_USERNS_CREATE); ad.subj_cred = current_cred(); label = begin_current_label_crit_section(); if (!unconfined(label)) { error = fn_for_each(label, profile, aa_profile_ns_perm(profile, &ad, AA_USERNS_CREATE)); } end_current_label_crit_section(label); return error; } static int apparmor_sk_alloc_security(struct sock *sk, int family, gfp_t flags) { struct aa_sk_ctx *ctx; ctx = kzalloc(sizeof(*ctx), flags); if (!ctx) return -ENOMEM; sk->sk_security = ctx; return 0; } static void apparmor_sk_free_security(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); sk->sk_security = NULL; aa_put_label(ctx->label); aa_put_label(ctx->peer); kfree(ctx); } /** * apparmor_sk_clone_security - clone the sk_security field * @sk: sock to have security cloned * @newsk: sock getting clone */ static void apparmor_sk_clone_security(const struct sock *sk, struct sock *newsk) { struct aa_sk_ctx *ctx = aa_sock(sk); struct aa_sk_ctx *new = aa_sock(newsk); if (new->label) aa_put_label(new->label); new->label = aa_get_label(ctx->label); if (new->peer) aa_put_label(new->peer); new->peer = aa_get_label(ctx->peer); } static int apparmor_socket_create(int family, int type, int protocol, int kern) { struct aa_label *label; int error = 0; AA_BUG(in_interrupt()); label = begin_current_label_crit_section(); if (!(kern || unconfined(label))) error = af_select(family, create_perm(label, family, type, protocol), aa_af_perm(current_cred(), label, OP_CREATE, AA_MAY_CREATE, family, type, protocol)); end_current_label_crit_section(label); return error; } /** * apparmor_socket_post_create - setup the per-socket security struct * @sock: socket that is being setup * @family: family of socket being created * @type: type of the socket * @ptotocol: protocol of the socket * @kern: socket is a special kernel socket * * Note: * - kernel sockets labeled kernel_t used to use unconfined * - socket may not have sk here if created with sock_create_lite or * sock_alloc. These should be accept cases which will be handled in * sock_graft. */ static int apparmor_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { struct aa_label *label; if (kern) { label = aa_get_label(kernel_t); } else label = aa_get_current_label(); if (sock->sk) { struct aa_sk_ctx *ctx = aa_sock(sock->sk); aa_put_label(ctx->label); ctx->label = aa_get_label(label); } aa_put_label(label); return 0; } static int apparmor_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!address); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, bind_perm(sock, address, addrlen), aa_sk_perm(OP_BIND, AA_MAY_BIND, sock->sk)); } static int apparmor_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!address); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, connect_perm(sock, address, addrlen), aa_sk_perm(OP_CONNECT, AA_MAY_CONNECT, sock->sk)); } static int apparmor_socket_listen(struct socket *sock, int backlog) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, listen_perm(sock, backlog), aa_sk_perm(OP_LISTEN, AA_MAY_LISTEN, sock->sk)); } /* * Note: while @newsock is created and has some information, the accept * has not been done. */ static int apparmor_socket_accept(struct socket *sock, struct socket *newsock) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!newsock); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, accept_perm(sock, newsock), aa_sk_perm(OP_ACCEPT, AA_MAY_ACCEPT, sock->sk)); } static int aa_sock_msg_perm(const char *op, u32 request, struct socket *sock, struct msghdr *msg, int size) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!msg); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, msg_perm(op, request, sock, msg, size), aa_sk_perm(op, request, sock->sk)); } static int apparmor_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return aa_sock_msg_perm(OP_SENDMSG, AA_MAY_SEND, sock, msg, size); } static int apparmor_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return aa_sock_msg_perm(OP_RECVMSG, AA_MAY_RECEIVE, sock, msg, size); } /* revaliation, get/set attr, shutdown */ static int aa_sock_perm(const char *op, u32 request, struct socket *sock) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, sock_perm(op, request, sock), aa_sk_perm(op, request, sock->sk)); } static int apparmor_socket_getsockname(struct socket *sock) { return aa_sock_perm(OP_GETSOCKNAME, AA_MAY_GETATTR, sock); } static int apparmor_socket_getpeername(struct socket *sock) { return aa_sock_perm(OP_GETPEERNAME, AA_MAY_GETATTR, sock); } /* revaliation, get/set attr, opt */ static int aa_sock_opt_perm(const char *op, u32 request, struct socket *sock, int level, int optname) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, opt_perm(op, request, sock, level, optname), aa_sk_perm(op, request, sock->sk)); } static int apparmor_socket_getsockopt(struct socket *sock, int level, int optname) { return aa_sock_opt_perm(OP_GETSOCKOPT, AA_MAY_GETOPT, sock, level, optname); } static int apparmor_socket_setsockopt(struct socket *sock, int level, int optname) { return aa_sock_opt_perm(OP_SETSOCKOPT, AA_MAY_SETOPT, sock, level, optname); } static int apparmor_socket_shutdown(struct socket *sock, int how) { return aa_sock_perm(OP_SHUTDOWN, AA_MAY_SHUTDOWN, sock); } #ifdef CONFIG_NETWORK_SECMARK /** * apparmor_socket_sock_rcv_skb - check perms before associating skb to sk * @sk: sk to associate @skb with * @skb: skb to check for perms * * Note: can not sleep may be called with locks held * * dont want protocol specific in __skb_recv_datagram() * to deny an incoming connection socket_sock_rcv_skb() */ static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct aa_sk_ctx *ctx = aa_sock(sk); if (!skb->secmark) return 0; return apparmor_secmark_check(ctx->label, OP_RECVMSG, AA_MAY_RECEIVE, skb->secmark, sk); } #endif static struct aa_label *sk_peer_label(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); if (ctx->peer) return ctx->peer; return ERR_PTR(-ENOPROTOOPT); } /** * apparmor_socket_getpeersec_stream - get security context of peer * @sock: socket that we are trying to get the peer context of * @optval: output - buffer to copy peer name to * @optlen: output - size of copied name in @optval * @len: size of @optval buffer * Returns: 0 on success, -errno of failure * * Note: for tcp only valid if using ipsec or cipso on lan */ static int apparmor_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { char *name = NULL; int slen, error = 0; struct aa_label *label; struct aa_label *peer; label = begin_current_label_crit_section(); peer = sk_peer_label(sock->sk); if (IS_ERR(peer)) { error = PTR_ERR(peer); goto done; } slen = aa_label_asxprint(&name, labels_ns(label), peer, FLAG_SHOW_MODE | FLAG_VIEW_SUBNS | FLAG_HIDDEN_UNCONFINED, GFP_KERNEL); /* don't include terminating \0 in slen, it breaks some apps */ if (slen < 0) { error = -ENOMEM; goto done; } if (slen > len) { error = -ERANGE; goto done_len; } if (copy_to_sockptr(optval, name, slen)) error = -EFAULT; done_len: if (copy_to_sockptr(optlen, &slen, sizeof(slen))) error = -EFAULT; done: end_current_label_crit_section(label); kfree(name); return error; } /** * apparmor_socket_getpeersec_dgram - get security label of packet * @sock: the peer socket * @skb: packet data * @secid: pointer to where to put the secid of the packet * * Sets the netlabel socket state on sk from parent */ static int apparmor_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { /* TODO: requires secid support */ return -ENOPROTOOPT; } /** * apparmor_sock_graft - Initialize newly created socket * @sk: child sock * @parent: parent socket * * Note: could set off of SOCK_CTX(parent) but need to track inode and we can * just set sk security information off of current creating process label * Labeling of sk for accept case - probably should be sock based * instead of task, because of the case where an implicitly labeled * socket is shared by different tasks. */ static void apparmor_sock_graft(struct sock *sk, struct socket *parent) { struct aa_sk_ctx *ctx = aa_sock(sk); if (!ctx->label) ctx->label = aa_get_current_label(); } #ifdef CONFIG_NETWORK_SECMARK static int apparmor_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct aa_sk_ctx *ctx = aa_sock(sk); if (!skb->secmark) return 0; return apparmor_secmark_check(ctx->label, OP_CONNECT, AA_MAY_CONNECT, skb->secmark, sk); } #endif /* * The cred blob is a pointer to, not an instance of, an aa_label. */ struct lsm_blob_sizes apparmor_blob_sizes __ro_after_init = { .lbs_cred = sizeof(struct aa_label *), .lbs_file = sizeof(struct aa_file_ctx), .lbs_task = sizeof(struct aa_task_ctx), }; static const struct lsm_id apparmor_lsmid = { .name = "apparmor", .id = LSM_ID_APPARMOR, }; static struct security_hook_list apparmor_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), LSM_HOOK_INIT(capget, apparmor_capget), LSM_HOOK_INIT(capable, apparmor_capable), LSM_HOOK_INIT(move_mount, apparmor_move_mount), LSM_HOOK_INIT(sb_mount, apparmor_sb_mount), LSM_HOOK_INIT(sb_umount, apparmor_sb_umount), LSM_HOOK_INIT(sb_pivotroot, apparmor_sb_pivotroot), LSM_HOOK_INIT(path_link, apparmor_path_link), LSM_HOOK_INIT(path_unlink, apparmor_path_unlink), LSM_HOOK_INIT(path_symlink, apparmor_path_symlink), LSM_HOOK_INIT(path_mkdir, apparmor_path_mkdir), LSM_HOOK_INIT(path_rmdir, apparmor_path_rmdir), LSM_HOOK_INIT(path_mknod, apparmor_path_mknod), LSM_HOOK_INIT(path_rename, apparmor_path_rename), LSM_HOOK_INIT(path_chmod, apparmor_path_chmod), LSM_HOOK_INIT(path_chown, apparmor_path_chown), LSM_HOOK_INIT(path_truncate, apparmor_path_truncate), LSM_HOOK_INIT(inode_getattr, apparmor_inode_getattr), LSM_HOOK_INIT(file_open, apparmor_file_open), LSM_HOOK_INIT(file_receive, apparmor_file_receive), LSM_HOOK_INIT(file_permission, apparmor_file_permission), LSM_HOOK_INIT(file_alloc_security, apparmor_file_alloc_security), LSM_HOOK_INIT(file_free_security, apparmor_file_free_security), LSM_HOOK_INIT(mmap_file, apparmor_mmap_file), LSM_HOOK_INIT(file_mprotect, apparmor_file_mprotect), LSM_HOOK_INIT(file_lock, apparmor_file_lock), LSM_HOOK_INIT(file_truncate, apparmor_file_truncate), LSM_HOOK_INIT(getselfattr, apparmor_getselfattr), LSM_HOOK_INIT(setselfattr, apparmor_setselfattr), LSM_HOOK_INIT(getprocattr, apparmor_getprocattr), LSM_HOOK_INIT(setprocattr, apparmor_setprocattr), LSM_HOOK_INIT(sk_alloc_security, apparmor_sk_alloc_security), LSM_HOOK_INIT(sk_free_security, apparmor_sk_free_security), LSM_HOOK_INIT(sk_clone_security, apparmor_sk_clone_security), LSM_HOOK_INIT(socket_create, apparmor_socket_create), LSM_HOOK_INIT(socket_post_create, apparmor_socket_post_create), LSM_HOOK_INIT(socket_bind, apparmor_socket_bind), LSM_HOOK_INIT(socket_connect, apparmor_socket_connect), LSM_HOOK_INIT(socket_listen, apparmor_socket_listen), LSM_HOOK_INIT(socket_accept, apparmor_socket_accept), LSM_HOOK_INIT(socket_sendmsg, apparmor_socket_sendmsg), LSM_HOOK_INIT(socket_recvmsg, apparmor_socket_recvmsg), LSM_HOOK_INIT(socket_getsockname, apparmor_socket_getsockname), LSM_HOOK_INIT(socket_getpeername, apparmor_socket_getpeername), LSM_HOOK_INIT(socket_getsockopt, apparmor_socket_getsockopt), LSM_HOOK_INIT(socket_setsockopt, apparmor_socket_setsockopt), LSM_HOOK_INIT(socket_shutdown, apparmor_socket_shutdown), #ifdef CONFIG_NETWORK_SECMARK LSM_HOOK_INIT(socket_sock_rcv_skb, apparmor_socket_sock_rcv_skb), #endif LSM_HOOK_INIT(socket_getpeersec_stream, apparmor_socket_getpeersec_stream), LSM_HOOK_INIT(socket_getpeersec_dgram, apparmor_socket_getpeersec_dgram), LSM_HOOK_INIT(sock_graft, apparmor_sock_graft), #ifdef CONFIG_NETWORK_SECMARK LSM_HOOK_INIT(inet_conn_request, apparmor_inet_conn_request), #endif LSM_HOOK_INIT(cred_alloc_blank, apparmor_cred_alloc_blank), LSM_HOOK_INIT(cred_free, apparmor_cred_free), LSM_HOOK_INIT(cred_prepare, apparmor_cred_prepare), LSM_HOOK_INIT(cred_transfer, apparmor_cred_transfer), LSM_HOOK_INIT(bprm_creds_for_exec, apparmor_bprm_creds_for_exec), LSM_HOOK_INIT(bprm_committing_creds, apparmor_bprm_committing_creds), LSM_HOOK_INIT(bprm_committed_creds, apparmor_bprm_committed_creds), LSM_HOOK_INIT(task_free, apparmor_task_free), LSM_HOOK_INIT(task_alloc, apparmor_task_alloc), LSM_HOOK_INIT(current_getsecid_subj, apparmor_current_getsecid_subj), LSM_HOOK_INIT(task_getsecid_obj, apparmor_task_getsecid_obj), LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit), LSM_HOOK_INIT(task_kill, apparmor_task_kill), LSM_HOOK_INIT(userns_create, apparmor_userns_create), #ifdef CONFIG_AUDIT LSM_HOOK_INIT(audit_rule_init, aa_audit_rule_init), LSM_HOOK_INIT(audit_rule_known, aa_audit_rule_known), LSM_HOOK_INIT(audit_rule_match, aa_audit_rule_match), LSM_HOOK_INIT(audit_rule_free, aa_audit_rule_free), #endif LSM_HOOK_INIT(secid_to_secctx, apparmor_secid_to_secctx), LSM_HOOK_INIT(secctx_to_secid, apparmor_secctx_to_secid), LSM_HOOK_INIT(release_secctx, apparmor_release_secctx), #ifdef CONFIG_IO_URING LSM_HOOK_INIT(uring_override_creds, apparmor_uring_override_creds), LSM_HOOK_INIT(uring_sqpoll, apparmor_uring_sqpoll), #endif }; /* * AppArmor sysfs module parameters */ static int param_set_aabool(const char *val, const struct kernel_param *kp); static int param_get_aabool(char *buffer, const struct kernel_param *kp); #define param_check_aabool param_check_bool static const struct kernel_param_ops param_ops_aabool = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_aabool, .get = param_get_aabool }; static int param_set_aauint(const char *val, const struct kernel_param *kp); static int param_get_aauint(char *buffer, const struct kernel_param *kp); #define param_check_aauint param_check_uint static const struct kernel_param_ops param_ops_aauint = { .set = param_set_aauint, .get = param_get_aauint }; static int param_set_aacompressionlevel(const char *val, const struct kernel_param *kp); static int param_get_aacompressionlevel(char *buffer, const struct kernel_param *kp); #define param_check_aacompressionlevel param_check_int static const struct kernel_param_ops param_ops_aacompressionlevel = { .set = param_set_aacompressionlevel, .get = param_get_aacompressionlevel }; static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp); static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp); #define param_check_aalockpolicy param_check_bool static const struct kernel_param_ops param_ops_aalockpolicy = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_aalockpolicy, .get = param_get_aalockpolicy }; static int param_set_audit(const char *val, const struct kernel_param *kp); static int param_get_audit(char *buffer, const struct kernel_param *kp); static int param_set_mode(const char *val, const struct kernel_param *kp); static int param_get_mode(char *buffer, const struct kernel_param *kp); /* Flag values, also controllable via /sys/module/apparmor/parameters * We define special types as we want to do additional mediation. */ /* AppArmor global enforcement switch - complain, enforce, kill */ enum profile_mode aa_g_profile_mode = APPARMOR_ENFORCE; module_param_call(mode, param_set_mode, param_get_mode, &aa_g_profile_mode, S_IRUSR | S_IWUSR); /* whether policy verification hashing is enabled */ bool aa_g_hash_policy = IS_ENABLED(CONFIG_SECURITY_APPARMOR_HASH_DEFAULT); #ifdef CONFIG_SECURITY_APPARMOR_HASH module_param_named(hash_policy, aa_g_hash_policy, aabool, S_IRUSR | S_IWUSR); #endif /* whether policy exactly as loaded is retained for debug and checkpointing */ bool aa_g_export_binary = IS_ENABLED(CONFIG_SECURITY_APPARMOR_EXPORT_BINARY); #ifdef CONFIG_SECURITY_APPARMOR_EXPORT_BINARY module_param_named(export_binary, aa_g_export_binary, aabool, 0600); #endif /* policy loaddata compression level */ int aa_g_rawdata_compression_level = AA_DEFAULT_CLEVEL; module_param_named(rawdata_compression_level, aa_g_rawdata_compression_level, aacompressionlevel, 0400); /* Debug mode */ bool aa_g_debug = IS_ENABLED(CONFIG_SECURITY_APPARMOR_DEBUG_MESSAGES); module_param_named(debug, aa_g_debug, aabool, S_IRUSR | S_IWUSR); /* Audit mode */ enum audit_mode aa_g_audit; module_param_call(audit, param_set_audit, param_get_audit, &aa_g_audit, S_IRUSR | S_IWUSR); /* Determines if audit header is included in audited messages. This * provides more context if the audit daemon is not running */ bool aa_g_audit_header = true; module_param_named(audit_header, aa_g_audit_header, aabool, S_IRUSR | S_IWUSR); /* lock out loading/removal of policy * TODO: add in at boot loading of policy, which is the only way to * load policy, if lock_policy is set */ bool aa_g_lock_policy; module_param_named(lock_policy, aa_g_lock_policy, aalockpolicy, S_IRUSR | S_IWUSR); /* Syscall logging mode */ bool aa_g_logsyscall; module_param_named(logsyscall, aa_g_logsyscall, aabool, S_IRUSR | S_IWUSR); /* Maximum pathname length before accesses will start getting rejected */ unsigned int aa_g_path_max = 2 * PATH_MAX; module_param_named(path_max, aa_g_path_max, aauint, S_IRUSR); /* Determines how paranoid loading of policy is and how much verification * on the loaded policy is done. * DEPRECATED: read only as strict checking of load is always done now * that none root users (user namespaces) can load policy. */ bool aa_g_paranoid_load = IS_ENABLED(CONFIG_SECURITY_APPARMOR_PARANOID_LOAD); module_param_named(paranoid_load, aa_g_paranoid_load, aabool, S_IRUGO); static int param_get_aaintbool(char *buffer, const struct kernel_param *kp); static int param_set_aaintbool(const char *val, const struct kernel_param *kp); #define param_check_aaintbool param_check_int static const struct kernel_param_ops param_ops_aaintbool = { .set = param_set_aaintbool, .get = param_get_aaintbool }; /* Boot time disable flag */ static int apparmor_enabled __ro_after_init = 1; module_param_named(enabled, apparmor_enabled, aaintbool, 0444); static int __init apparmor_enabled_setup(char *str) { unsigned long enabled; int error = kstrtoul(str, 0, &enabled); if (!error) apparmor_enabled = enabled ? 1 : 0; return 1; } __setup("apparmor=", apparmor_enabled_setup); /* set global flag turning off the ability to load policy */ static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; return param_set_bool(val, kp); } static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_bool(buffer, kp); } static int param_set_aabool(const char *val, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; return param_set_bool(val, kp); } static int param_get_aabool(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_bool(buffer, kp); } static int param_set_aauint(const char *val, const struct kernel_param *kp) { int error; if (!apparmor_enabled) return -EINVAL; /* file is ro but enforce 2nd line check */ if (apparmor_initialized) return -EPERM; error = param_set_uint(val, kp); aa_g_path_max = max_t(uint32_t, aa_g_path_max, sizeof(union aa_buffer)); pr_info("AppArmor: buffer size set to %d bytes\n", aa_g_path_max); return error; } static int param_get_aauint(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_uint(buffer, kp); } /* Can only be set before AppArmor is initialized (i.e. on boot cmdline). */ static int param_set_aaintbool(const char *val, const struct kernel_param *kp) { struct kernel_param kp_local; bool value; int error; if (apparmor_initialized) return -EPERM; /* Create local copy, with arg pointing to bool type. */ value = !!*((int *)kp->arg); memcpy(&kp_local, kp, sizeof(kp_local)); kp_local.arg = &value; error = param_set_bool(val, &kp_local); if (!error) *((int *)kp->arg) = *((bool *)kp_local.arg); return error; } /* * To avoid changing /sys/module/apparmor/parameters/enabled from Y/N to * 1/0, this converts the "int that is actually bool" back to bool for * display in the /sys filesystem, while keeping it "int" for the LSM * infrastructure. */ static int param_get_aaintbool(char *buffer, const struct kernel_param *kp) { struct kernel_param kp_local; bool value; /* Create local copy, with arg pointing to bool type. */ value = !!*((int *)kp->arg); memcpy(&kp_local, kp, sizeof(kp_local)); kp_local.arg = &value; return param_get_bool(buffer, &kp_local); } static int param_set_aacompressionlevel(const char *val, const struct kernel_param *kp) { int error; if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized) return -EPERM; error = param_set_int(val, kp); aa_g_rawdata_compression_level = clamp(aa_g_rawdata_compression_level, AA_MIN_CLEVEL, AA_MAX_CLEVEL); pr_info("AppArmor: policy rawdata compression level set to %d\n", aa_g_rawdata_compression_level); return error; } static int param_get_aacompressionlevel(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_int(buffer, kp); } static int param_get_audit(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return sprintf(buffer, "%s", audit_mode_names[aa_g_audit]); } static int param_set_audit(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = match_string(audit_mode_names, AUDIT_MAX_INDEX, val); if (i < 0) return -EINVAL; aa_g_audit = i; return 0; } static int param_get_mode(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return sprintf(buffer, "%s", aa_profile_mode_names[aa_g_profile_mode]); } static int param_set_mode(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = match_string(aa_profile_mode_names, APPARMOR_MODE_NAMES_MAX_INDEX, val); if (i < 0) return -EINVAL; aa_g_profile_mode = i; return 0; } char *aa_get_buffer(bool in_atomic) { union aa_buffer *aa_buf; struct aa_local_cache *cache; bool try_again = true; gfp_t flags = (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); /* use per cpu cached buffers first */ cache = get_cpu_ptr(&aa_local_buffers); if (!list_empty(&cache->head)) { aa_buf = list_first_entry(&cache->head, union aa_buffer, list); list_del(&aa_buf->list); cache->hold--; cache->count--; put_cpu_ptr(&aa_local_buffers); return &aa_buf->buffer[0]; } put_cpu_ptr(&aa_local_buffers); if (!spin_trylock(&aa_buffers_lock)) { cache = get_cpu_ptr(&aa_local_buffers); cache->hold += 1; put_cpu_ptr(&aa_local_buffers); spin_lock(&aa_buffers_lock); } else { cache = get_cpu_ptr(&aa_local_buffers); put_cpu_ptr(&aa_local_buffers); } retry: if (buffer_count > reserve_count || (in_atomic && !list_empty(&aa_global_buffers))) { aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, list); list_del(&aa_buf->list); buffer_count--; spin_unlock(&aa_buffers_lock); return aa_buf->buffer; } if (in_atomic) { /* * out of reserve buffers and in atomic context so increase * how many buffers to keep in reserve */ reserve_count++; flags = GFP_ATOMIC; } spin_unlock(&aa_buffers_lock); if (!in_atomic) might_sleep(); aa_buf = kmalloc(aa_g_path_max, flags); if (!aa_buf) { if (try_again) { try_again = false; spin_lock(&aa_buffers_lock); goto retry; } pr_warn_once("AppArmor: Failed to allocate a memory buffer.\n"); return NULL; } return aa_buf->buffer; } void aa_put_buffer(char *buf) { union aa_buffer *aa_buf; struct aa_local_cache *cache; if (!buf) return; aa_buf = container_of(buf, union aa_buffer, buffer[0]); cache = get_cpu_ptr(&aa_local_buffers); if (!cache->hold) { put_cpu_ptr(&aa_local_buffers); if (spin_trylock(&aa_buffers_lock)) { /* put back on global list */ list_add(&aa_buf->list, &aa_global_buffers); buffer_count++; spin_unlock(&aa_buffers_lock); cache = get_cpu_ptr(&aa_local_buffers); put_cpu_ptr(&aa_local_buffers); return; } /* contention on global list, fallback to percpu */ cache = get_cpu_ptr(&aa_local_buffers); cache->hold += 1; } /* cache in percpu list */ list_add(&aa_buf->list, &cache->head); cache->count++; put_cpu_ptr(&aa_local_buffers); } /* * AppArmor init functions */ /** * set_init_ctx - set a task context and profile on the first task. * * TODO: allow setting an alternate profile than unconfined */ static int __init set_init_ctx(void) { struct cred *cred = (__force struct cred *)current->real_cred; set_cred_label(cred, aa_get_label(ns_unconfined(root_ns))); return 0; } static void destroy_buffers(void) { union aa_buffer *aa_buf; spin_lock(&aa_buffers_lock); while (!list_empty(&aa_global_buffers)) { aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, list); list_del(&aa_buf->list); spin_unlock(&aa_buffers_lock); kfree(aa_buf); spin_lock(&aa_buffers_lock); } spin_unlock(&aa_buffers_lock); } static int __init alloc_buffers(void) { union aa_buffer *aa_buf; int i, num; /* * per cpu set of cached allocated buffers used to help reduce * lock contention */ for_each_possible_cpu(i) { per_cpu(aa_local_buffers, i).hold = 0; per_cpu(aa_local_buffers, i).count = 0; INIT_LIST_HEAD(&per_cpu(aa_local_buffers, i).head); } /* * A function may require two buffers at once. Usually the buffers are * used for a short period of time and are shared. On UP kernel buffers * two should be enough, with more CPUs it is possible that more * buffers will be used simultaneously. The preallocated pool may grow. * This preallocation has also the side-effect that AppArmor will be * disabled early at boot if aa_g_path_max is extremly high. */ if (num_online_cpus() > 1) num = 4 + RESERVE_COUNT; else num = 2 + RESERVE_COUNT; for (i = 0; i < num; i++) { aa_buf = kmalloc(aa_g_path_max, GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); if (!aa_buf) { destroy_buffers(); return -ENOMEM; } aa_put_buffer(aa_buf->buffer); } return 0; } #ifdef CONFIG_SYSCTL static int apparmor_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!aa_current_policy_admin_capable(NULL)) return -EPERM; if (!apparmor_enabled) return -EINVAL; return proc_dointvec(table, write, buffer, lenp, ppos); } static struct ctl_table apparmor_sysctl_table[] = { #ifdef CONFIG_USER_NS { .procname = "unprivileged_userns_apparmor_policy", .data = &unprivileged_userns_apparmor_policy, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, #endif /* CONFIG_USER_NS */ { .procname = "apparmor_display_secid_mode", .data = &apparmor_display_secid_mode, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, { .procname = "apparmor_restrict_unprivileged_unconfined", .data = &aa_unprivileged_unconfined_restricted, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, { } }; static int __init apparmor_init_sysctl(void) { return register_sysctl("kernel", apparmor_sysctl_table) ? 0 : -ENOMEM; } #else static inline int apparmor_init_sysctl(void) { return 0; } #endif /* CONFIG_SYSCTL */ #if defined(CONFIG_NETFILTER) && defined(CONFIG_NETWORK_SECMARK) static unsigned int apparmor_ip_postroute(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct aa_sk_ctx *ctx; struct sock *sk; if (!skb->secmark) return NF_ACCEPT; sk = skb_to_full_sk(skb); if (sk == NULL) return NF_ACCEPT; ctx = aa_sock(sk); if (!apparmor_secmark_check(ctx->label, OP_SENDMSG, AA_MAY_SEND, skb->secmark, sk)) return NF_ACCEPT; return NF_DROP_ERR(-ECONNREFUSED); } static const struct nf_hook_ops apparmor_nf_ops[] = { { .hook = apparmor_ip_postroute, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_SELINUX_FIRST, }, #if IS_ENABLED(CONFIG_IPV6) { .hook = apparmor_ip_postroute, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_SELINUX_FIRST, }, #endif }; static int __net_init apparmor_nf_register(struct net *net) { return nf_register_net_hooks(net, apparmor_nf_ops, ARRAY_SIZE(apparmor_nf_ops)); } static void __net_exit apparmor_nf_unregister(struct net *net) { nf_unregister_net_hooks(net, apparmor_nf_ops, ARRAY_SIZE(apparmor_nf_ops)); } static struct pernet_operations apparmor_net_ops = { .init = apparmor_nf_register, .exit = apparmor_nf_unregister, }; static int __init apparmor_nf_ip_init(void) { int err; if (!apparmor_enabled) return 0; err = register_pernet_subsys(&apparmor_net_ops); if (err) panic("Apparmor: register_pernet_subsys: error %d\n", err); return 0; } __initcall(apparmor_nf_ip_init); #endif static char nulldfa_src[] = { #include "nulldfa.in" }; static struct aa_dfa *nulldfa; static char stacksplitdfa_src[] = { #include "stacksplitdfa.in" }; struct aa_dfa *stacksplitdfa; struct aa_policydb *nullpdb; static int __init aa_setup_dfa_engine(void) { int error = -ENOMEM; nullpdb = aa_alloc_pdb(GFP_KERNEL); if (!nullpdb) return -ENOMEM; nulldfa = aa_dfa_unpack(nulldfa_src, sizeof(nulldfa_src), TO_ACCEPT1_FLAG(YYTD_DATA32) | TO_ACCEPT2_FLAG(YYTD_DATA32)); if (IS_ERR(nulldfa)) { error = PTR_ERR(nulldfa); goto fail; } nullpdb->dfa = aa_get_dfa(nulldfa); nullpdb->perms = kcalloc(2, sizeof(struct aa_perms), GFP_KERNEL); if (!nullpdb->perms) goto fail; nullpdb->size = 2; stacksplitdfa = aa_dfa_unpack(stacksplitdfa_src, sizeof(stacksplitdfa_src), TO_ACCEPT1_FLAG(YYTD_DATA32) | TO_ACCEPT2_FLAG(YYTD_DATA32)); if (IS_ERR(stacksplitdfa)) { error = PTR_ERR(stacksplitdfa); goto fail; } return 0; fail: aa_put_pdb(nullpdb); aa_put_dfa(nulldfa); nullpdb = NULL; nulldfa = NULL; stacksplitdfa = NULL; return error; } static void __init aa_teardown_dfa_engine(void) { aa_put_dfa(stacksplitdfa); aa_put_dfa(nulldfa); aa_put_pdb(nullpdb); nullpdb = NULL; stacksplitdfa = NULL; nulldfa = NULL; } static int __init apparmor_init(void) { int error; error = aa_setup_dfa_engine(); if (error) { AA_ERROR("Unable to setup dfa engine\n"); goto alloc_out; } error = aa_alloc_root_ns(); if (error) { AA_ERROR("Unable to allocate default profile namespace\n"); goto alloc_out; } error = apparmor_init_sysctl(); if (error) { AA_ERROR("Unable to register sysctls\n"); goto alloc_out; } error = alloc_buffers(); if (error) { AA_ERROR("Unable to allocate work buffers\n"); goto alloc_out; } error = set_init_ctx(); if (error) { AA_ERROR("Failed to set context on init task\n"); aa_free_root_ns(); goto buffers_out; } security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), &apparmor_lsmid); /* Report that AppArmor successfully initialized */ apparmor_initialized = 1; if (aa_g_profile_mode == APPARMOR_COMPLAIN) aa_info_message("AppArmor initialized: complain mode enabled"); else if (aa_g_profile_mode == APPARMOR_KILL) aa_info_message("AppArmor initialized: kill mode enabled"); else aa_info_message("AppArmor initialized"); return error; buffers_out: destroy_buffers(); alloc_out: aa_destroy_aafs(); aa_teardown_dfa_engine(); apparmor_enabled = false; return error; } DEFINE_LSM(apparmor) = { .name = "apparmor", .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE, .enabled = &apparmor_enabled, .blobs = &apparmor_blob_sizes, .init = apparmor_init, }; |
403 64 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Statically sized hash table implementation * (C) 2012 Sasha Levin <levinsasha928@gmail.com> */ #ifndef _LINUX_HASHTABLE_H #define _LINUX_HASHTABLE_H #include <linux/list.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/hash.h> #include <linux/rculist.h> #define DEFINE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } #define DEFINE_READ_MOSTLY_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] __read_mostly = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } #define DECLARE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] #define HASH_SIZE(name) (ARRAY_SIZE(name)) #define HASH_BITS(name) ilog2(HASH_SIZE(name)) /* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */ #define hash_min(val, bits) \ (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits)) static inline void __hash_init(struct hlist_head *ht, unsigned int sz) { unsigned int i; for (i = 0; i < sz; i++) INIT_HLIST_HEAD(&ht[i]); } /** * hash_init - initialize a hash table * @hashtable: hashtable to be initialized * * Calculates the size of the hashtable from the given parameter, otherwise * same as hash_init_size. * * This has to be a macro since HASH_BITS() will not work on pointers since * it calculates the size during preprocessing. */ #define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable)) /** * hash_add - add an object to a hashtable * @hashtable: hashtable to add to * @node: the &struct hlist_node of the object to be added * @key: the key of the object to be added */ #define hash_add(hashtable, node, key) \ hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) /** * hash_add_rcu - add an object to a rcu enabled hashtable * @hashtable: hashtable to add to * @node: the &struct hlist_node of the object to be added * @key: the key of the object to be added */ #define hash_add_rcu(hashtable, node, key) \ hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) /** * hash_hashed - check whether an object is in any hashtable * @node: the &struct hlist_node of the object to be checked */ static inline bool hash_hashed(struct hlist_node *node) { return !hlist_unhashed(node); } static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz) { unsigned int i; for (i = 0; i < sz; i++) if (!hlist_empty(&ht[i])) return false; return true; } /** * hash_empty - check whether a hashtable is empty * @hashtable: hashtable to check * * This has to be a macro since HASH_BITS() will not work on pointers since * it calculates the size during preprocessing. */ #define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable)) /** * hash_del - remove an object from a hashtable * @node: &struct hlist_node of the object to remove */ static inline void hash_del(struct hlist_node *node) { hlist_del_init(node); } /** * hash_del_rcu - remove an object from a rcu enabled hashtable * @node: &struct hlist_node of the object to remove */ static inline void hash_del_rcu(struct hlist_node *node) { hlist_del_init_rcu(node); } /** * hash_for_each - iterate over a hashtable * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each(name, bkt, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry(obj, &name[bkt], member) /** * hash_for_each_rcu - iterate over a rcu enabled hashtable * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each_rcu(name, bkt, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry_rcu(obj, &name[bkt], member) /** * hash_for_each_safe - iterate over a hashtable safe against removal of * hash entry * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @tmp: a &struct hlist_node used for temporary storage * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each_safe(name, bkt, tmp, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry_safe(obj, tmp, &name[bkt], member) /** * hash_for_each_possible - iterate over all possible objects hashing to the * same bucket * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible(name, obj, member, key) \ hlist_for_each_entry(obj, &name[hash_min(key, HASH_BITS(name))], member) /** * hash_for_each_possible_rcu - iterate over all possible objects hashing to the * same bucket in an rcu enabled hashtable * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible_rcu(name, obj, member, key, cond...) \ hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\ member, ## cond) /** * hash_for_each_possible_rcu_notrace - iterate over all possible objects hashing * to the same bucket in an rcu enabled hashtable in a rcu enabled hashtable * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over * * This is the same as hash_for_each_possible_rcu() except that it does * not do any RCU debugging or tracing. */ #define hash_for_each_possible_rcu_notrace(name, obj, member, key) \ hlist_for_each_entry_rcu_notrace(obj, \ &name[hash_min(key, HASH_BITS(name))], member) /** * hash_for_each_possible_safe - iterate over all possible objects hashing to the * same bucket safe against removals * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @tmp: a &struct hlist_node used for temporary storage * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible_safe(name, obj, tmp, member, key) \ hlist_for_each_entry_safe(obj, tmp,\ &name[hash_min(key, HASH_BITS(name))], member) #endif |
31 264 3 3244 11728 2603 6101 7289 8827 12 14 52 45 90 301 6 8 11994 3 10 5 543 11 7 10 11 23 6 23 18 1249 43 2 1066 1408 7 24 4 12 2 90 30 10 9 407 11 12 28 100 168 2378 228 73 1 4 62 61 335 334 29 69 268 268 11619 8010 4 22 724 1 32 8571 1 9059 9062 5156 5392 129 13 100 5438 18 495 2 6 7203 1 6 1501 267 1 49 419 13 27 305 665 128 857 9 823 203 270 3 1121 1121 27 76 3 527 3 5 39 3 3 661 1317 2849 8985 9 1892 749 9 733 701 60 37 278 282 2 51 49 1 24 78 1 122 11 118 2 2 4 161 54 2 154 22 1 222 14 22 14 2 11 12 11 639 26 7876 2 60 55 305 244 445 7846 1 4 827 179 382 797 19 2 286 290 23 1 9 14 5 2 2 7 3 273 15 44 11 334 9 8 11755 29 9055 14 87 87 9 8599 13 171 12 75 810 9063 87 87 8184 1 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct sk_buff' memory handlers. * * Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> * Florian La Roche, <rzsfl@rz.uni-sb.de> */ #ifndef _LINUX_SKBUFF_H #define _LINUX_SKBUFF_H #include <linux/kernel.h> #include <linux/compiler.h> #include <linux/time.h> #include <linux/bug.h> #include <linux/bvec.h> #include <linux/cache.h> #include <linux/rbtree.h> #include <linux/socket.h> #include <linux/refcount.h> #include <linux/atomic.h> #include <asm/types.h> #include <linux/spinlock.h> #include <net/checksum.h> #include <linux/rcupdate.h> #include <linux/dma-mapping.h> #include <linux/netdev_features.h> #include <net/flow_dissector.h> #include <linux/in6.h> #include <linux/if_packet.h> #include <linux/llist.h> #include <net/flow.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <linux/netfilter/nf_conntrack_common.h> #endif #include <net/net_debug.h> #include <net/dropreason-core.h> #include <net/netmem.h> /** * DOC: skb checksums * * The interface for checksum offload between the stack and networking drivers * is as follows... * * IP checksum related features * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Drivers advertise checksum offload capabilities in the features of a device. * From the stack's point of view these are capabilities offered by the driver. * A driver typically only advertises features that it is capable of offloading * to its device. * * .. flat-table:: Checksum related device features * :widths: 1 10 * * * - %NETIF_F_HW_CSUM * - The driver (or its device) is able to compute one * IP (one's complement) checksum for any combination * of protocols or protocol layering. The checksum is * computed and set in a packet per the CHECKSUM_PARTIAL * interface (see below). * * * - %NETIF_F_IP_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv4. These are specifically * unencapsulated packets of the form IPv4|TCP or * IPv4|UDP where the Protocol field in the IPv4 header * is TCP or UDP. The IPv4 header may contain IP options. * This feature cannot be set in features for a device * with NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). * * * - %NETIF_F_IPV6_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with * NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). * * * - %NETIF_F_RXCSUM * - Driver (device) performs receive checksum offload. * This flag is only used to disable the RX checksum * feature for a device. The stack will accept receive * checksum indication in packets received on a device * regardless of whether NETIF_F_RXCSUM is set. * * Checksumming of received packets by device * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Indication of checksum verification is set in &sk_buff.ip_summed. * Possible values are: * * - %CHECKSUM_NONE * * Device did not checksum this packet e.g. due to lack of capabilities. * The packet contains full (though not verified) checksum in packet but * not in skb->csum. Thus, skb->csum is undefined in this case. * * - %CHECKSUM_UNNECESSARY * * The hardware you're dealing with doesn't calculate the full checksum * (as in %CHECKSUM_COMPLETE), but it does parse headers and verify checksums * for specific protocols. For such packets it will set %CHECKSUM_UNNECESSARY * if their checksums are okay. &sk_buff.csum is still undefined in this case * though. A driver or device must never modify the checksum field in the * packet even if checksum is verified. * * %CHECKSUM_UNNECESSARY is applicable to following protocols: * * - TCP: IPv6 and IPv4. * - UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a * zero UDP checksum for either IPv4 or IPv6, the networking stack * may perform further validation in this case. * - GRE: only if the checksum is present in the header. * - SCTP: indicates the CRC in SCTP header has been validated. * - FCOE: indicates the CRC in FC frame has been validated. * * &sk_buff.csum_level indicates the number of consecutive checksums found in * the packet minus one that have been verified as %CHECKSUM_UNNECESSARY. * For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet * and a device is able to verify the checksums for UDP (possibly zero), * GRE (checksum flag is set) and TCP, &sk_buff.csum_level would be set to * two. If the device were only able to verify the UDP checksum and not * GRE, either because it doesn't support GRE checksum or because GRE * checksum is bad, skb->csum_level would be set to zero (TCP checksum is * not considered in this case). * * - %CHECKSUM_COMPLETE * * This is the most generic way. The device supplied checksum of the _whole_ * packet as seen by netif_rx() and fills in &sk_buff.csum. This means the * hardware doesn't need to parse L3/L4 headers to implement this. * * Notes: * * - Even if device supports only some protocols, but is able to produce * skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY. * - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols. * * - %CHECKSUM_PARTIAL * * A checksum is set up to be offloaded to a device as described in the * output description for CHECKSUM_PARTIAL. This may occur on a packet * received directly from another Linux OS, e.g., a virtualized Linux kernel * on the same host, or it may be set in the input path in GRO or remote * checksum offload. For the purposes of checksum verification, the checksum * referred to by skb->csum_start + skb->csum_offset and any preceding * checksums in the packet are considered verified. Any checksums in the * packet that are after the checksum being offloaded are not considered to * be verified. * * Checksumming on transmit for non-GSO * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * The stack requests checksum offload in the &sk_buff.ip_summed for a packet. * Values are: * * - %CHECKSUM_PARTIAL * * The driver is required to checksum the packet as seen by hard_start_xmit() * from &sk_buff.csum_start up to the end, and to record/write the checksum at * offset &sk_buff.csum_start + &sk_buff.csum_offset. * A driver may verify that the * csum_start and csum_offset values are valid values given the length and * offset of the packet, but it should not attempt to validate that the * checksum refers to a legitimate transport layer checksum -- it is the * purview of the stack to validate that csum_start and csum_offset are set * correctly. * * When the stack requests checksum offload for a packet, the driver MUST * ensure that the checksum is set correctly. A driver can either offload the * checksum calculation to the device, or call skb_checksum_help (in the case * that the device does not support offload for a particular checksum). * * %NETIF_F_IP_CSUM and %NETIF_F_IPV6_CSUM are being deprecated in favor of * %NETIF_F_HW_CSUM. New devices should use %NETIF_F_HW_CSUM to indicate * checksum offload capability. * skb_csum_hwoffload_help() can be called to resolve %CHECKSUM_PARTIAL based * on network device checksumming capabilities: if a packet does not match * them, skb_checksum_help() or skb_crc32c_help() (depending on the value of * &sk_buff.csum_not_inet, see :ref:`crc`) * is called to resolve the checksum. * * - %CHECKSUM_NONE * * The skb was already checksummed by the protocol, or a checksum is not * required. * * - %CHECKSUM_UNNECESSARY * * This has the same meaning as CHECKSUM_NONE for checksum offload on * output. * * - %CHECKSUM_COMPLETE * * Not used in checksum output. If a driver observes a packet with this value * set in skbuff, it should treat the packet as if %CHECKSUM_NONE were set. * * .. _crc: * * Non-IP checksum (CRC) offloads * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * .. flat-table:: * :widths: 1 10 * * * - %NETIF_F_SCTP_CRC * - This feature indicates that a device is capable of * offloading the SCTP CRC in a packet. To perform this offload the stack * will set csum_start and csum_offset accordingly, set ip_summed to * %CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication * in the skbuff that the %CHECKSUM_PARTIAL refers to CRC32c. * A driver that supports both IP checksum offload and SCTP CRC32c offload * must verify which offload is configured for a packet by testing the * value of &sk_buff.csum_not_inet; skb_crc32c_csum_help() is provided to * resolve %CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1. * * * - %NETIF_F_FCOE_CRC * - This feature indicates that a device is capable of offloading the FCOE * CRC in a packet. To perform this offload the stack will set ip_summed * to %CHECKSUM_PARTIAL and set csum_start and csum_offset * accordingly. Note that there is no indication in the skbuff that the * %CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports * both IP checksum offload and FCOE CRC offload must verify which offload * is configured for a packet, presumably by inspecting packet headers. * * Checksumming on output with GSO * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * In the case of a GSO packet (skb_is_gso() is true), checksum offload * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the * gso_type is %SKB_GSO_TCPV4 or %SKB_GSO_TCPV6, TCP checksum offload as * part of the GSO operation is implied. If a checksum is being offloaded * with GSO then ip_summed is %CHECKSUM_PARTIAL, and both csum_start and * csum_offset are set to refer to the outermost checksum being offloaded * (two offloaded checksums are possible with UDP encapsulation). */ /* Don't change this without changing skb_csum_unnecessary! */ #define CHECKSUM_NONE 0 #define CHECKSUM_UNNECESSARY 1 #define CHECKSUM_COMPLETE 2 #define CHECKSUM_PARTIAL 3 /* Maximum value in skb->csum_level */ #define SKB_MAX_CSUM_LEVEL 3 #define SKB_DATA_ALIGN(X) ALIGN(X, SMP_CACHE_BYTES) #define SKB_WITH_OVERHEAD(X) \ ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) /* For X bytes available in skb->head, what is the minimal * allocation needed, knowing struct skb_shared_info needs * to be aligned. */ #define SKB_HEAD_ALIGN(X) (SKB_DATA_ALIGN(X) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) #define SKB_MAX_ORDER(X, ORDER) \ SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X)) #define SKB_MAX_HEAD(X) (SKB_MAX_ORDER((X), 0)) #define SKB_MAX_ALLOC (SKB_MAX_ORDER(0, 2)) /* return minimum truesize of one skb containing X bytes of data */ #define SKB_TRUESIZE(X) ((X) + \ SKB_DATA_ALIGN(sizeof(struct sk_buff)) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) struct ahash_request; struct net_device; struct scatterlist; struct pipe_inode_info; struct iov_iter; struct napi_struct; struct bpf_prog; union bpf_attr; struct skb_ext; struct ts_config; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info { enum { BRNF_PROTO_UNCHANGED, BRNF_PROTO_8021Q, BRNF_PROTO_PPPOE } orig_proto:8; u8 pkt_otherhost:1; u8 in_prerouting:1; u8 bridged_dnat:1; u8 sabotage_in_done:1; __u16 frag_max_size; int physinif; /* always valid & non-NULL from FORWARD on, for physdev match */ struct net_device *physoutdev; union { /* prerouting: detect dnat in orig/reply direction */ __be32 ipv4_daddr; struct in6_addr ipv6_daddr; /* after prerouting + nat detected: store original source * mac since neigh resolution overwrites it, only used while * skb is out in neigh layer. */ char neigh_header[8]; }; }; #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) /* Chain in tc_skb_ext will be used to share the tc chain with * ovs recirc_id. It will be set to the current chain by tc * and read by ovs to recirc_id. */ struct tc_skb_ext { union { u64 act_miss_cookie; __u32 chain; }; __u16 mru; __u16 zone; u8 post_ct:1; u8 post_ct_snat:1; u8 post_ct_dnat:1; u8 act_miss:1; /* Set if act_miss_cookie is used */ u8 l2_miss:1; /* Set by bridge upon FDB or MDB miss */ }; #endif struct sk_buff_head { /* These two members must be first to match sk_buff. */ struct_group_tagged(sk_buff_list, list, struct sk_buff *next; struct sk_buff *prev; ); __u32 qlen; spinlock_t lock; }; struct sk_buff; #ifndef CONFIG_MAX_SKB_FRAGS # define CONFIG_MAX_SKB_FRAGS 17 #endif #define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS extern int sysctl_max_skb_frags; /* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to * segment using its current segmentation instead. */ #define GSO_BY_FRAGS 0xFFFF typedef struct skb_frag { netmem_ref netmem; unsigned int len; unsigned int offset; } skb_frag_t; /** * skb_frag_size() - Returns the size of a skb fragment * @frag: skb fragment */ static inline unsigned int skb_frag_size(const skb_frag_t *frag) { return frag->len; } /** * skb_frag_size_set() - Sets the size of a skb fragment * @frag: skb fragment * @size: size of fragment */ static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size) { frag->len = size; } /** * skb_frag_size_add() - Increments the size of a skb fragment by @delta * @frag: skb fragment * @delta: value to add */ static inline void skb_frag_size_add(skb_frag_t *frag, int delta) { frag->len += delta; } /** * skb_frag_size_sub() - Decrements the size of a skb fragment by @delta * @frag: skb fragment * @delta: value to subtract */ static inline void skb_frag_size_sub(skb_frag_t *frag, int delta) { frag->len -= delta; } /** * skb_frag_must_loop - Test if %p is a high memory page * @p: fragment's page */ static inline bool skb_frag_must_loop(struct page *p) { #if defined(CONFIG_HIGHMEM) if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(p)) return true; #endif return false; } /** * skb_frag_foreach_page - loop over pages in a fragment * * @f: skb frag to operate on * @f_off: offset from start of f->netmem * @f_len: length from f_off to loop over * @p: (temp var) current page * @p_off: (temp var) offset from start of current page, * non-zero only on first page. * @p_len: (temp var) length in current page, * < PAGE_SIZE only on first and last page. * @copied: (temp var) length so far, excluding current p_len. * * A fragment can hold a compound page, in which case per-page * operations, notably kmap_atomic, must be called for each * regular page. */ #define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied) \ for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT), \ p_off = (f_off) & (PAGE_SIZE - 1), \ p_len = skb_frag_must_loop(p) ? \ min_t(u32, f_len, PAGE_SIZE - p_off) : f_len, \ copied = 0; \ copied < f_len; \ copied += p_len, p++, p_off = 0, \ p_len = min_t(u32, f_len - copied, PAGE_SIZE)) \ /** * struct skb_shared_hwtstamps - hardware time stamps * @hwtstamp: hardware time stamp transformed into duration * since arbitrary point in time * @netdev_data: address/cookie of network device driver used as * reference to actual hardware time stamp * * Software time stamps generated by ktime_get_real() are stored in * skb->tstamp. * * hwtstamps can only be compared against other hwtstamps from * the same device. * * This structure is attached to packets as part of the * &skb_shared_info. Use skb_hwtstamps() to get a pointer. */ struct skb_shared_hwtstamps { union { ktime_t hwtstamp; void *netdev_data; }; }; /* Definitions for tx_flags in struct skb_shared_info */ enum { /* generate hardware time stamp */ SKBTX_HW_TSTAMP = 1 << 0, /* generate software time stamp when queueing packet to NIC */ SKBTX_SW_TSTAMP = 1 << 1, /* device driver is going to provide hardware time stamp */ SKBTX_IN_PROGRESS = 1 << 2, /* generate hardware time stamp based on cycles if supported */ SKBTX_HW_TSTAMP_USE_CYCLES = 1 << 3, /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, /* determine hardware time stamp based on time or cycles */ SKBTX_HW_TSTAMP_NETDEV = 1 << 5, /* generate software time stamp when entering packet scheduling */ SKBTX_SCHED_TSTAMP = 1 << 6, }; #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ SKBTX_SCHED_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \ SKBTX_HW_TSTAMP_USE_CYCLES | \ SKBTX_ANY_SW_TSTAMP) /* Definitions for flags in struct skb_shared_info */ enum { /* use zcopy routines */ SKBFL_ZEROCOPY_ENABLE = BIT(0), /* This indicates at least one fragment might be overwritten * (as in vmsplice(), sendfile() ...) * If we need to compute a TX checksum, we'll need to copy * all frags to avoid possible bad checksum */ SKBFL_SHARED_FRAG = BIT(1), /* segment contains only zerocopy data and should not be * charged to the kernel memory. */ SKBFL_PURE_ZEROCOPY = BIT(2), SKBFL_DONT_ORPHAN = BIT(3), /* page references are managed by the ubuf_info, so it's safe to * use frags only up until ubuf_info is released */ SKBFL_MANAGED_FRAG_REFS = BIT(4), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) #define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \ SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS) /* * The callback notifies userspace to release buffers when skb DMA is done in * lower device, the skb last reference should be 0 when calling this. * The zerocopy_success argument is true if zero copy transmit occurred, * false on data copy or out of memory error caused by data copy attempt. * The ctx field is used to track device context. * The desc field is used to track userspace buffer index. */ struct ubuf_info { void (*callback)(struct sk_buff *, struct ubuf_info *, bool zerocopy_success); refcount_t refcnt; u8 flags; }; struct ubuf_info_msgzc { struct ubuf_info ubuf; union { struct { unsigned long desc; void *ctx; }; struct { u32 id; u16 len; u16 zerocopy:1; u32 bytelen; }; }; struct mmpin { struct user_struct *user; unsigned int num_pg; } mmp; }; #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) #define uarg_to_msgzc(ubuf_ptr) container_of((ubuf_ptr), struct ubuf_info_msgzc, \ ubuf) int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); /* Preserve some data across TX submission and completion. * * Note, this state is stored in the driver. Extending the layout * might need some special care. */ struct xsk_tx_metadata_compl { __u64 *tx_timestamp; }; /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ struct skb_shared_info { __u8 flags; __u8 meta_len; __u8 nr_frags; __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! */ unsigned short gso_segs; struct sk_buff *frag_list; union { struct skb_shared_hwtstamps hwtstamps; struct xsk_tx_metadata_compl xsk_meta; }; unsigned int gso_type; u32 tskey; /* * Warning : all fields before dataref are cleared in __alloc_skb() */ atomic_t dataref; unsigned int xdp_frags_size; /* Intermediate layers must ensure that destructor_arg * remains valid until skb destructor */ void * destructor_arg; /* must be last field, see pskb_expand_head() */ skb_frag_t frags[MAX_SKB_FRAGS]; }; /** * DOC: dataref and headerless skbs * * Transport layers send out clones of payload skbs they hold for * retransmissions. To allow lower layers of the stack to prepend their headers * we split &skb_shared_info.dataref into two halves. * The lower 16 bits count the overall number of references. * The higher 16 bits indicate how many of the references are payload-only. * skb_header_cloned() checks if skb is allowed to add / write the headers. * * The creator of the skb (e.g. TCP) marks its skb as &sk_buff.nohdr * (via __skb_header_release()). Any clone created from marked skb will get * &sk_buff.hdr_len populated with the available headroom. * If there's the only clone in existence it's able to modify the headroom * at will. The sequence of calls inside the transport layer is:: * * <alloc skb> * skb_reserve() * __skb_header_release() * skb_clone() * // send the clone down the stack * * This is not a very generic construct and it depends on the transport layers * doing the right thing. In practice there's usually only one payload-only skb. * Having multiple payload-only skbs with different lengths of hdr_len is not * possible. The payload-only skbs should never leave their owner. */ #define SKB_DATAREF_SHIFT 16 #define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1) enum { SKB_FCLONE_UNAVAILABLE, /* skb has no fclone (from head_cache) */ SKB_FCLONE_ORIG, /* orig skb (from fclone_cache) */ SKB_FCLONE_CLONE, /* companion fclone skb (from fclone_cache) */ }; enum { SKB_GSO_TCPV4 = 1 << 0, /* This indicates the skb is from an untrusted source. */ SKB_GSO_DODGY = 1 << 1, /* This indicates the tcp segment has CWR set. */ SKB_GSO_TCP_ECN = 1 << 2, SKB_GSO_TCP_FIXEDID = 1 << 3, SKB_GSO_TCPV6 = 1 << 4, SKB_GSO_FCOE = 1 << 5, SKB_GSO_GRE = 1 << 6, SKB_GSO_GRE_CSUM = 1 << 7, SKB_GSO_IPXIP4 = 1 << 8, SKB_GSO_IPXIP6 = 1 << 9, SKB_GSO_UDP_TUNNEL = 1 << 10, SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, SKB_GSO_PARTIAL = 1 << 12, SKB_GSO_TUNNEL_REMCSUM = 1 << 13, SKB_GSO_SCTP = 1 << 14, SKB_GSO_ESP = 1 << 15, SKB_GSO_UDP = 1 << 16, SKB_GSO_UDP_L4 = 1 << 17, SKB_GSO_FRAGLIST = 1 << 18, }; #if BITS_PER_LONG > 32 #define NET_SKBUFF_DATA_USES_OFFSET 1 #endif #ifdef NET_SKBUFF_DATA_USES_OFFSET typedef unsigned int sk_buff_data_t; #else typedef unsigned char *sk_buff_data_t; #endif /** * DOC: Basic sk_buff geometry * * struct sk_buff itself is a metadata structure and does not hold any packet * data. All the data is held in associated buffers. * * &sk_buff.head points to the main "head" buffer. The head buffer is divided * into two parts: * * - data buffer, containing headers and sometimes payload; * this is the part of the skb operated on by the common helpers * such as skb_put() or skb_pull(); * - shared info (struct skb_shared_info) which holds an array of pointers * to read-only data in the (page, offset, length) format. * * Optionally &skb_shared_info.frag_list may point to another skb. * * Basic diagram may look like this:: * * --------------- * | sk_buff | * --------------- * ,--------------------------- + head * / ,----------------- + data * / / ,----------- + tail * | | | , + end * | | | | * v v v v * ----------------------------------------------- * | headroom | data | tailroom | skb_shared_info | * ----------------------------------------------- * + [page frag] * + [page frag] * + [page frag] * + [page frag] --------- * + frag_list --> | sk_buff | * --------- * */ /** * struct sk_buff - socket buffer * @next: Next buffer in list * @prev: Previous buffer in list * @tstamp: Time we arrived/left * @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point * for retransmit timer * @rbnode: RB tree node, alternative to next/prev for netem/tcp * @list: queue head * @ll_node: anchor in an llist (eg socket defer_list) * @sk: Socket we are owned by * @ip_defrag_offset: (aka @sk) alternate use of @sk, used in * fragmentation management * @dev: Device we arrived on/are leaving by * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL * @cb: Control buffer. Free for use by every layer. Put private vars here * @_skb_refdst: destination entry (with norefcount bit) * @len: Length of actual data * @data_len: Data length * @mac_len: Length of link layer header * @hdr_len: writable header length of cloned skb * @csum: Checksum (must include start/offset pair) * @csum_start: Offset from skb->head where checksumming should start * @csum_offset: Offset from csum_start where checksum should be stored * @priority: Packet queueing priority * @ignore_df: allow local fragmentation * @cloned: Head may be cloned (check refcnt to be sure) * @ip_summed: Driver fed us an IP checksum * @nohdr: Payload reference only, must not modify header * @pkt_type: Packet class * @fclone: skbuff clone status * @ipvs_property: skbuff is owned by ipvs * @inner_protocol_type: whether the inner protocol is * ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO * @remcsum_offload: remote checksum offload is enabled * @offload_fwd_mark: Packet was L2-forwarded in hardware * @offload_l3_fwd_mark: Packet was L3-forwarded in hardware * @tc_skip_classify: do not classify packet. set by IFB device * @tc_at_ingress: used within tc_classify to distinguish in/egress * @redirected: packet was redirected by packet classifier * @from_ingress: packet was redirected from the ingress path * @nf_skip_egress: packet shall skip nf egress - see netfilter_netdev.h * @peeked: this packet has been seen already, so stats have been * done for it, don't do them again * @nf_trace: netfilter packet trace flag * @protocol: Packet protocol from driver * @destructor: Destruct function * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) * @_sk_redir: socket redirection information for skmsg * @_nfct: Associated connection, if any (with nfctinfo bits) * @skb_iif: ifindex of device we arrived on * @tc_index: Traffic control index * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @head_frag: skb was allocated from page fragments, * not allocated by kmalloc() or vmalloc(). * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves * @pp_recycle: mark the packet for recycling instead of freeing (implies * page_pool support on driver) * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport * ports. * @sw_hash: indicates hash was computed in software stack * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @encapsulation: indicates the inner headers in the skbuff are valid * @encap_hdr_csum: software checksum is needed * @csum_valid: checksum is already valid * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL * @csum_complete_sw: checksum was completed by software * @csum_level: indicates the number of consecutive checksums found in * the packet minus one that have been verified as * CHECKSUM_UNNECESSARY (max 3) * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required * @mono_delivery_time: When set, skb->tstamp has the * delivery_time in mono clock base (i.e. EDT). Otherwise, the * skb->tstamp has the (rcv) timestamp at ingress and * delivery_time at egress. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @alloc_cpu: CPU which did the skb allocation. * @secmark: security marking * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available * at the tail of an sk_buff * @vlan_all: vlan fields (proto & tci) * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information * @inner_protocol: Protocol (encapsulation) * @inner_ipproto: (aka @inner_protocol) stores ipproto when * skb->inner_protocol_type == ENCAP_TYPE_IPPROTO; * @inner_transport_header: Inner transport layer header (encapsulation) * @inner_network_header: Network layer header (encapsulation) * @inner_mac_header: Link layer header (encapsulation) * @transport_header: Transport layer header * @network_header: Network layer header * @mac_header: Link layer header * @kcov_handle: KCOV remote handle for remote coverage collection * @tail: Tail pointer * @end: End pointer * @head: Head of buffer * @data: Data head pointer * @truesize: Buffer size * @users: User count - see {datagram,tcp}.c * @extensions: allocated extensions, valid if active_extensions is nonzero */ struct sk_buff { union { struct { /* These two members must be first to match sk_buff_head. */ struct sk_buff *next; struct sk_buff *prev; union { struct net_device *dev; /* Some protocols might use this space to store information, * while device pointer would be NULL. * UDP receive path is one user. */ unsigned long dev_scratch; }; }; struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; struct llist_node ll_node; }; union { struct sock *sk; int ip_defrag_offset; }; union { ktime_t tstamp; u64 skb_mstamp_ns; /* earliest departure time */ }; /* * This is the control buffer. It is free to use for every * layer. Please put your private variables there. If you * want to keep them across layers you have to do a skb_clone() * first. This is owned by whoever has the skb queued ATM. */ char cb[48] __aligned(8); union { struct { unsigned long _skb_refdst; void (*destructor)(struct sk_buff *skb); }; struct list_head tcp_tsorted_anchor; #ifdef CONFIG_NET_SOCK_MSG unsigned long _sk_redir; #endif }; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) unsigned long _nfct; #endif unsigned int len, data_len; __u16 mac_len, hdr_len; /* Following fields are _not_ copied in __copy_skb_header() * Note that queue_mapping is here mostly to fill a hole. */ __u16 queue_mapping; /* if you move cloned around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD #define CLONED_MASK (1 << 7) #else #define CLONED_MASK 1 #endif #define CLONED_OFFSET offsetof(struct sk_buff, __cloned_offset) /* private: */ __u8 __cloned_offset[0]; /* public: */ __u8 cloned:1, nohdr:1, fclone:2, peeked:1, head_frag:1, pfmemalloc:1, pp_recycle:1; /* page_pool recycle indicator */ #ifdef CONFIG_SKB_EXTENSIONS __u8 active_extensions; #endif /* Fields enclosed in headers group are copied * using a single memcpy() in __copy_skb_header() */ struct_group(headers, /* private: */ __u8 __pkt_type_offset[0]; /* public: */ __u8 pkt_type:3; /* see PKT_TYPE_MAX */ __u8 ignore_df:1; __u8 dst_pending_confirm:1; __u8 ip_summed:2; __u8 ooo_okay:1; /* private: */ __u8 __mono_tc_offset[0]; /* public: */ __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */ #ifdef CONFIG_NET_XGRESS __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ __u8 tc_skip_classify:1; #endif __u8 remcsum_offload:1; __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 inner_protocol_type:1; __u8 l4_hash:1; __u8 sw_hash:1; #ifdef CONFIG_WIRELESS __u8 wifi_acked_valid:1; __u8 wifi_acked:1; #endif __u8 no_fcs:1; /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif #if IS_ENABLED(CONFIG_IP_VS) __u8 ipvs_property:1; #endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) __u8 nf_trace:1; #endif #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; #endif __u8 redirected:1; #ifdef CONFIG_NET_REDIRECT __u8 from_ingress:1; #endif #ifdef CONFIG_NETFILTER_SKIP_EGRESS __u8 nf_skip_egress:1; #endif #ifdef CONFIG_TLS_DEVICE __u8 decrypted:1; #endif __u8 slow_gro:1; #if IS_ENABLED(CONFIG_IP_SCTP) __u8 csum_not_inet:1; #endif #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS) __u16 tc_index; /* traffic control index */ #endif u16 alloc_cpu; union { __wsum csum; struct { __u16 csum_start; __u16 csum_offset; }; }; __u32 priority; int skb_iif; __u32 hash; union { u32 vlan_all; struct { __be16 vlan_proto; __u16 vlan_tci; }; }; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) union { unsigned int napi_id; unsigned int sender_cpu; }; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; #endif union { __u32 mark; __u32 reserved_tailroom; }; union { __be16 inner_protocol; __u8 inner_ipproto; }; __u16 inner_transport_header; __u16 inner_network_header; __u16 inner_mac_header; __be16 protocol; __u16 transport_header; __u16 network_header; __u16 mac_header; #ifdef CONFIG_KCOV u64 kcov_handle; #endif ); /* end headers group */ /* These elements must be at the end, see alloc_skb() for details. */ sk_buff_data_t tail; sk_buff_data_t end; unsigned char *head, *data; unsigned int truesize; refcount_t users; #ifdef CONFIG_SKB_EXTENSIONS /* only usable after checking ->active_extensions != 0 */ struct skb_ext *extensions; #endif }; /* if you move pkt_type around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD #define PKT_TYPE_MAX (7 << 5) #else #define PKT_TYPE_MAX 7 #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) /* if you move tc_at_ingress or mono_delivery_time * around, you also must adapt these constants. */ #ifdef __BIG_ENDIAN_BITFIELD #define SKB_MONO_DELIVERY_TIME_MASK (1 << 7) #define TC_AT_INGRESS_MASK (1 << 6) #else #define SKB_MONO_DELIVERY_TIME_MASK (1 << 0) #define TC_AT_INGRESS_MASK (1 << 1) #endif #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset) #ifdef __KERNEL__ /* * Handling routines are only of interest to the kernel */ #define SKB_ALLOC_FCLONE 0x01 #define SKB_ALLOC_RX 0x02 #define SKB_ALLOC_NAPI 0x04 /** * skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves * @skb: buffer */ static inline bool skb_pfmemalloc(const struct sk_buff *skb) { return unlikely(skb->pfmemalloc); } /* * skb might have a dst pointer attached, refcounted or not. * _skb_refdst low order bit is set if refcount was _not_ taken */ #define SKB_DST_NOREF 1UL #define SKB_DST_PTRMASK ~(SKB_DST_NOREF) /** * skb_dst - returns skb dst_entry * @skb: buffer * * Returns skb dst_entry, regardless of reference taken or not. */ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) { /* If refdst was not refcounted, check we still are in a * rcu_read_lock section */ WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) && !rcu_read_lock_held() && !rcu_read_lock_bh_held()); return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK); } /** * skb_dst_set - sets skb dst * @skb: buffer * @dst: dst entry * * Sets skb dst, assuming a reference was taken on dst and should * be released by skb_dst_drop() */ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) { skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst; } /** * skb_dst_set_noref - sets skb dst, hopefully, without taking reference * @skb: buffer * @dst: dst entry * * Sets skb dst, assuming a reference was not taken on dst. * If dst entry is cached, we do not take reference and dst_release * will be avoided by refdst_drop. If dst entry is not cached, we take * reference, so that last dst_release can destroy the dst immediately. */ static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; } /** * skb_dst_is_noref - Test if skb dst isn't refcounted * @skb: buffer */ static inline bool skb_dst_is_noref(const struct sk_buff *skb) { return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb); } /** * skb_rtable - Returns the skb &rtable * @skb: buffer */ static inline struct rtable *skb_rtable(const struct sk_buff *skb) { return (struct rtable *)skb_dst(skb); } /* For mangling skb->pkt_type from user space side from applications * such as nft, tc, etc, we only allow a conservative subset of * possible pkt_types to be set. */ static inline bool skb_pkt_type_ok(u32 ptype) { return ptype <= PACKET_OTHERHOST; } /** * skb_napi_id - Returns the skb's NAPI id * @skb: buffer */ static inline unsigned int skb_napi_id(const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL return skb->napi_id; #else return 0; #endif } static inline bool skb_wifi_acked_valid(const struct sk_buff *skb) { #ifdef CONFIG_WIRELESS return skb->wifi_acked_valid; #else return 0; #endif } /** * skb_unref - decrement the skb's reference count * @skb: buffer * * Returns true if we can free the skb. */ static inline bool skb_unref(struct sk_buff *skb) { if (unlikely(!skb)) return false; if (likely(refcount_read(&skb->users) == 1)) smp_rmb(); else if (likely(!refcount_dec_and_test(&skb->users))) return false; return true; } static inline bool skb_data_unref(const struct sk_buff *skb, struct skb_shared_info *shinfo) { int bias; if (!skb->cloned) return true; bias = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1; if (atomic_read(&shinfo->dataref) == bias) smp_rmb(); else if (atomic_sub_return(bias, &shinfo->dataref)) return false; return true; } void __fix_address kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); /** * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason * @skb: buffer to free */ static inline void kfree_skb(struct sk_buff *skb) { kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } void skb_release_head_state(struct sk_buff *skb); void kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason); void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt); void skb_tx_error(struct sk_buff *skb); static inline void kfree_skb_list(struct sk_buff *segs) { kfree_skb_list_reason(segs, SKB_DROP_REASON_NOT_SPECIFIED); } #ifdef CONFIG_TRACEPOINTS void consume_skb(struct sk_buff *skb); #else static inline void consume_skb(struct sk_buff *skb) { return kfree_skb(skb); } #endif void __consume_stateless_skb(struct sk_buff *skb); void __kfree_skb(struct sk_buff *skb); void kfree_skb_partial(struct sk_buff *skb, bool head_stolen); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize); struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node); struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size); void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); struct sk_buff *slab_build_skb(void *data); /** * alloc_skb - allocate a network buffer * @size: size to allocate * @priority: allocation mask * * This function is a convenient wrapper around __alloc_skb(). */ static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority) { return __alloc_skb(size, priority, 0, NUMA_NO_NODE); } struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, int max_page_order, int *errcode, gfp_t gfp_mask); struct sk_buff *alloc_skb_for_msg(struct sk_buff *first); /* Layout of fast clones : [skb1][skb2][fclone_ref] */ struct sk_buff_fclones { struct sk_buff skb1; struct sk_buff skb2; refcount_t fclone_ref; }; /** * skb_fclone_busy - check if fclone is busy * @sk: socket * @skb: buffer * * Returns true if skb is a fast clone, and its clone is not freed. * Some drivers call skb_orphan() in their ndo_start_xmit(), * so we also check that didn't happen. */ static inline bool skb_fclone_busy(const struct sock *sk, const struct sk_buff *skb) { const struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); return skb->fclone == SKB_FCLONE_ORIG && refcount_read(&fclones->fclone_ref) > 1 && READ_ONCE(fclones->skb2.sk) == sk; } /** * alloc_skb_fclone - allocate a network buffer from fclone cache * @size: size to allocate * @priority: allocation mask * * This function is a convenient wrapper around __alloc_skb(). */ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); } struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); void skb_headers_offset_update(struct sk_buff *skb, int off); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old); struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority); struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, gfp_t gfp_mask, bool fclone); static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) { return __pskb_copy_fclone(skb, headroom, gfp_mask, false); } int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask); struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t priority); int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error); /** * skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * * May return error in out of memory cases. The skb is freed on error. */ static inline int skb_pad(struct sk_buff *skb, int pad) { return __skb_pad(skb, pad, true); } #define dev_kfree_skb(a) consume_skb(a) int skb_append_pagefrags(struct sk_buff *skb, struct page *page, int offset, size_t size, size_t max_frags); struct skb_seq_state { __u32 lower_offset; __u32 upper_offset; __u32 frag_idx; __u32 stepped_offset; struct sk_buff *root_skb; struct sk_buff *cur_skb; __u8 *frag_data; __u32 frag_off; }; void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, unsigned int to, struct skb_seq_state *st); unsigned int skb_seq_read(unsigned int consumed, const u8 **data, struct skb_seq_state *st); void skb_abort_seq_read(struct skb_seq_state *st); unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, unsigned int to, struct ts_config *config); /* * Packet hash types specify the type of hash in skb_set_hash. * * Hash types refer to the protocol layer addresses which are used to * construct a packet's hash. The hashes are used to differentiate or identify * flows of the protocol layer for the hash type. Hash types are either * layer-2 (L2), layer-3 (L3), or layer-4 (L4). * * Properties of hashes: * * 1) Two packets in different flows have different hash values * 2) Two packets in the same flow should have the same hash value * * A hash at a higher layer is considered to be more specific. A driver should * set the most specific hash possible. * * A driver cannot indicate a more specific hash than the layer at which a hash * was computed. For instance an L3 hash cannot be set as an L4 hash. * * A driver may indicate a hash level which is less specific than the * actual layer the hash was computed on. For instance, a hash computed * at L4 may be considered an L3 hash. This should only be done if the * driver can't unambiguously determine that the HW computed the hash at * the higher layer. Note that the "should" in the second property above * permits this. */ enum pkt_hash_types { PKT_HASH_TYPE_NONE, /* Undefined type */ PKT_HASH_TYPE_L2, /* Input: src_MAC, dest_MAC */ PKT_HASH_TYPE_L3, /* Input: src_IP, dst_IP */ PKT_HASH_TYPE_L4, /* Input: src_IP, dst_IP, src_port, dst_port */ }; static inline void skb_clear_hash(struct sk_buff *skb) { skb->hash = 0; skb->sw_hash = 0; skb->l4_hash = 0; } static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb) { if (!skb->l4_hash) skb_clear_hash(skb); } static inline void __skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4) { skb->l4_hash = is_l4; skb->sw_hash = is_sw; skb->hash = hash; } static inline void skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type) { /* Used by drivers to set hash from HW */ __skb_set_hash(skb, hash, false, type == PKT_HASH_TYPE_L4); } static inline void __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4) { __skb_set_hash(skb, hash, true, is_l4); } void __skb_get_hash(struct sk_buff *skb); u32 __skb_get_hash_symmetric(const struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen); __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, const void *data, int hlen_proto); static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) { return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); struct bpf_flow_dissector; u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags); static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, unsigned int flags) { return __skb_flow_dissect(NULL, skb, flow_dissector, target_container, NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, struct flow_keys *flow, unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(NULL, skb, &flow_keys_dissector, flow, NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys_basic(const struct net *net, const struct sk_buff *skb, struct flow_keys_basic *flow, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow, data, proto, nhoff, hlen, flags); } void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); /* Gets a skb connection tracking info, ctinfo map should be a * map of mapsize to translate enum ip_conntrack_info states * to user states. */ void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, size_t mapsize, bool post_ct, u16 zone); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); void skb_flow_dissect_hash(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); static inline __u32 skb_get_hash(struct sk_buff *skb) { if (!skb->l4_hash && !skb->sw_hash) __skb_get_hash(skb); return skb->hash; } static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) { if (!skb->l4_hash && !skb->sw_hash) { struct flow_keys keys; __u32 hash = __get_hash_from_flowi6(fl6, &keys); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } return skb->hash; } __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb); static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) { return skb->hash; } static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) { to->hash = from->hash; to->sw_hash = from->sw_hash; to->l4_hash = from->l4_hash; }; static inline int skb_cmp_decrypted(const struct sk_buff *skb1, const struct sk_buff *skb2) { #ifdef CONFIG_TLS_DEVICE return skb2->decrypted - skb1->decrypted; #else return 0; #endif } static inline void skb_copy_decrypted(struct sk_buff *to, const struct sk_buff *from) { #ifdef CONFIG_TLS_DEVICE to->decrypted = from->decrypted; #endif } #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { return skb->head + skb->end; } static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end; } static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) { skb->end = offset; } #else static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { return skb->end; } static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end - skb->head; } static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) { skb->end = skb->head + offset; } #endif struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, bool success); int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len); } int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg); /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) { return &skb_shinfo(skb)->hwtstamps; } static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) { bool is_zcopy = skb && skb_shinfo(skb)->flags & SKBFL_ZEROCOPY_ENABLE; return is_zcopy ? skb_uarg(skb) : NULL; } static inline bool skb_zcopy_pure(const struct sk_buff *skb) { return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; } static inline bool skb_zcopy_managed(const struct sk_buff *skb) { return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS; } static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, const struct sk_buff *skb2) { return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2); } static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); } static inline void skb_zcopy_init(struct sk_buff *skb, struct ubuf_info *uarg) { skb_shinfo(skb)->destructor_arg = uarg; skb_shinfo(skb)->flags |= uarg->flags; } static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, bool *have_ref) { if (skb && uarg && !skb_zcopy(skb)) { if (unlikely(have_ref && *have_ref)) *have_ref = false; else net_zcopy_get(uarg); skb_zcopy_init(skb, uarg); } } static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val) { skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t) val | 0x1UL); skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG; } static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb) { return (uintptr_t) skb_shinfo(skb)->destructor_arg & 0x1UL; } static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb) { return (void *)((uintptr_t) skb_shinfo(skb)->destructor_arg & ~0x1UL); } static inline void net_zcopy_put(struct ubuf_info *uarg) { if (uarg) uarg->callback(NULL, uarg, true); } static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { if (uarg->callback == msg_zerocopy_callback) msg_zerocopy_put_abort(uarg, have_uref); else if (have_uref) net_zcopy_put(uarg); } } /* Release a reference on a zerocopy structure */ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) { struct ubuf_info *uarg = skb_zcopy(skb); if (uarg) { if (!skb_zcopy_is_nouarg(skb)) uarg->callback(skb, uarg, zerocopy_success); skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; } } void __skb_zcopy_downgrade_managed(struct sk_buff *skb); static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb) { if (unlikely(skb_zcopy_managed(skb))) __skb_zcopy_downgrade_managed(skb); } static inline void skb_mark_not_on_list(struct sk_buff *skb) { skb->next = NULL; } static inline void skb_poison_list(struct sk_buff *skb) { #ifdef CONFIG_DEBUG_NET skb->next = SKB_LIST_POISON_NEXT; #endif } /* Iterate through singly-linked GSO fragments of an skb. */ #define skb_list_walk_safe(first, skb, next_skb) \ for ((skb) = (first), (next_skb) = (skb) ? (skb)->next : NULL; (skb); \ (skb) = (next_skb), (next_skb) = (skb) ? (skb)->next : NULL) static inline void skb_list_del_init(struct sk_buff *skb) { __list_del_entry(&skb->list); skb_mark_not_on_list(skb); } /** * skb_queue_empty - check if a queue is empty * @list: queue head * * Returns true if the queue is empty, false otherwise. */ static inline int skb_queue_empty(const struct sk_buff_head *list) { return list->next == (const struct sk_buff *) list; } /** * skb_queue_empty_lockless - check if a queue is empty * @list: queue head * * Returns true if the queue is empty, false otherwise. * This variant can be used in lockless contexts. */ static inline bool skb_queue_empty_lockless(const struct sk_buff_head *list) { return READ_ONCE(list->next) == (const struct sk_buff *) list; } /** * skb_queue_is_last - check if skb is the last entry in the queue * @list: queue head * @skb: buffer * * Returns true if @skb is the last buffer on the list. */ static inline bool skb_queue_is_last(const struct sk_buff_head *list, const struct sk_buff *skb) { return skb->next == (const struct sk_buff *) list; } /** * skb_queue_is_first - check if skb is the first entry in the queue * @list: queue head * @skb: buffer * * Returns true if @skb is the first buffer on the list. */ static inline bool skb_queue_is_first(const struct sk_buff_head *list, const struct sk_buff *skb) { return skb->prev == (const struct sk_buff *) list; } /** * skb_queue_next - return the next packet in the queue * @list: queue head * @skb: current buffer * * Return the next packet in @list after @skb. It is only valid to * call this if skb_queue_is_last() evaluates to false. */ static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list, const struct sk_buff *skb) { /* This BUG_ON may seem severe, but if we just return then we * are going to dereference garbage. */ BUG_ON(skb_queue_is_last(list, skb)); return skb->next; } /** * skb_queue_prev - return the prev packet in the queue * @list: queue head * @skb: current buffer * * Return the prev packet in @list before @skb. It is only valid to * call this if skb_queue_is_first() evaluates to false. */ static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list, const struct sk_buff *skb) { /* This BUG_ON may seem severe, but if we just return then we * are going to dereference garbage. */ BUG_ON(skb_queue_is_first(list, skb)); return skb->prev; } /** * skb_get - reference buffer * @skb: buffer to reference * * Makes another reference to a socket buffer and returns a pointer * to the buffer. */ static inline struct sk_buff *skb_get(struct sk_buff *skb) { refcount_inc(&skb->users); return skb; } /* * If users == 1, we are the only owner and can avoid redundant atomic changes. */ /** * skb_cloned - is the buffer a clone * @skb: buffer to check * * Returns true if the buffer was generated with skb_clone() and is * one of multiple shared copies of the buffer. Cloned buffers are * shared data so must not be written to under normal circumstances. */ static inline int skb_cloned(const struct sk_buff *skb) { return skb->cloned && (atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK) != 1; } static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); return 0; } /* This variant of skb_unclone() makes sure skb->truesize * and skb_end_offset() are not changed, whenever a new skb->head is needed. * * Indeed there is no guarantee that ksize(kmalloc(X)) == ksize(kmalloc(X)) * when various debugging features are in place. */ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri); static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return __skb_unclone_keeptruesize(skb, pri); return 0; } /** * skb_header_cloned - is the header a clone * @skb: buffer to check * * Returns true if modifying the header part of the buffer requires * the data to be copied. */ static inline int skb_header_cloned(const struct sk_buff *skb) { int dataref; if (!skb->cloned) return 0; dataref = atomic_read(&skb_shinfo(skb)->dataref); dataref = (dataref & SKB_DATAREF_MASK) - (dataref >> SKB_DATAREF_SHIFT); return dataref != 1; } static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_header_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); return 0; } /** * __skb_header_release() - allow clones to use the headroom * @skb: buffer to operate on * * See "DOC: dataref and headerless skbs". */ static inline void __skb_header_release(struct sk_buff *skb) { skb->nohdr = 1; atomic_set(&skb_shinfo(skb)->dataref, 1 + (1 << SKB_DATAREF_SHIFT)); } /** * skb_shared - is the buffer shared * @skb: buffer to check * * Returns true if more than one person has a reference to this * buffer. */ static inline int skb_shared(const struct sk_buff *skb) { return refcount_read(&skb->users) != 1; } /** * skb_share_check - check if buffer is shared and if so clone it * @skb: buffer to check * @pri: priority for memory allocation * * If the buffer is shared the buffer is cloned and the old copy * drops a reference. A new clone with a single reference is returned. * If the buffer is not shared the original buffer is returned. When * being called from interrupt status or with spinlocks held pri must * be GFP_ATOMIC. * * NULL is returned on a memory allocation failure. */ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, pri); if (likely(nskb)) consume_skb(skb); else kfree_skb(skb); skb = nskb; } return skb; } /* * Copy shared buffers into a new sk_buff. We effectively do COW on * packets to handle cases where we have a local reader and forward * and a couple of other messy ones. The normal one is tcpdumping * a packet that's being forwarded. */ /** * skb_unshare - make a copy of a shared buffer * @skb: buffer to check * @pri: priority for memory allocation * * If the socket buffer is a clone then this function creates a new * copy of the data, drops a reference count on the old copy and returns * the new copy with the reference count at 1. If the buffer is not a clone * the original buffer is returned. When called with a spinlock held or * from interrupt state @pri must be %GFP_ATOMIC * * %NULL is returned on a memory allocation failure. */ static inline struct sk_buff *skb_unshare(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, pri); /* Free our shared copy */ if (likely(nskb)) consume_skb(skb); else kfree_skb(skb); skb = nskb; } return skb; } /** * skb_peek - peek at the head of an &sk_buff_head * @list_: list to peek at * * Peek an &sk_buff. Unlike most other operations you _MUST_ * be careful with this one. A peek leaves the buffer on the * list and someone else may run off with it. You must hold * the appropriate locks or have a private queue to do this. * * Returns %NULL for an empty list or a pointer to the head element. * The reference count is not incremented and the reference is therefore * volatile. Use with caution. */ static inline struct sk_buff *skb_peek(const struct sk_buff_head *list_) { struct sk_buff *skb = list_->next; if (skb == (struct sk_buff *)list_) skb = NULL; return skb; } /** * __skb_peek - peek at the head of a non-empty &sk_buff_head * @list_: list to peek at * * Like skb_peek(), but the caller knows that the list is not empty. */ static inline struct sk_buff *__skb_peek(const struct sk_buff_head *list_) { return list_->next; } /** * skb_peek_next - peek skb following the given one from a queue * @skb: skb to start from * @list_: list to peek at * * Returns %NULL when the end of the list is met or a pointer to the * next element. The reference count is not incremented and the * reference is therefore volatile. Use with caution. */ static inline struct sk_buff *skb_peek_next(struct sk_buff *skb, const struct sk_buff_head *list_) { struct sk_buff *next = skb->next; if (next == (struct sk_buff *)list_) next = NULL; return next; } /** * skb_peek_tail - peek at the tail of an &sk_buff_head * @list_: list to peek at * * Peek an &sk_buff. Unlike most other operations you _MUST_ * be careful with this one. A peek leaves the buffer on the * list and someone else may run off with it. You must hold * the appropriate locks or have a private queue to do this. * * Returns %NULL for an empty list or a pointer to the tail element. * The reference count is not incremented and the reference is therefore * volatile. Use with caution. */ static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_) { struct sk_buff *skb = READ_ONCE(list_->prev); if (skb == (struct sk_buff *)list_) skb = NULL; return skb; } /** * skb_queue_len - get queue length * @list_: list to measure * * Return the length of an &sk_buff queue. */ static inline __u32 skb_queue_len(const struct sk_buff_head *list_) { return list_->qlen; } /** * skb_queue_len_lockless - get queue length * @list_: list to measure * * Return the length of an &sk_buff queue. * This variant can be used in lockless contexts. */ static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_) { return READ_ONCE(list_->qlen); } /** * __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head * @list: queue to initialize * * This initializes only the list and queue length aspects of * an sk_buff_head object. This allows to initialize the list * aspects of an sk_buff_head without reinitializing things like * the spinlock. It can also be used for on-stack sk_buff_head * objects where the spinlock is known to not be used. */ static inline void __skb_queue_head_init(struct sk_buff_head *list) { list->prev = list->next = (struct sk_buff *)list; list->qlen = 0; } /* * This function creates a split out lock class for each invocation; * this is needed for now since a whole lot of users of the skb-queue * infrastructure in drivers have different locking usage (in hardirq) * than the networking core (in softirq only). In the long run either the * network layer or drivers should need annotation to consolidate the * main types of usage into 3 classes. */ static inline void skb_queue_head_init(struct sk_buff_head *list) { spin_lock_init(&list->lock); __skb_queue_head_init(list); } static inline void skb_queue_head_init_class(struct sk_buff_head *list, struct lock_class_key *class) { skb_queue_head_init(list); lockdep_set_class(&list->lock, class); } /* * Insert an sk_buff on a list. * * The "__skb_xxxx()" functions are the non-atomic ones that * can only be called with interrupts disabled. */ static inline void __skb_insert(struct sk_buff *newsk, struct sk_buff *prev, struct sk_buff *next, struct sk_buff_head *list) { /* See skb_queue_empty_lockless() and skb_peek_tail() * for the opposite READ_ONCE() */ WRITE_ONCE(newsk->next, next); WRITE_ONCE(newsk->prev, prev); WRITE_ONCE(((struct sk_buff_list *)next)->prev, newsk); WRITE_ONCE(((struct sk_buff_list *)prev)->next, newsk); WRITE_ONCE(list->qlen, list->qlen + 1); } static inline void __skb_queue_splice(const struct sk_buff_head *list, struct sk_buff *prev, struct sk_buff *next) { struct sk_buff *first = list->next; struct sk_buff *last = list->prev; WRITE_ONCE(first->prev, prev); WRITE_ONCE(prev->next, first); WRITE_ONCE(last->next, next); WRITE_ONCE(next->prev, last); } /** * skb_queue_splice - join two skb lists, this is designed for stacks * @list: the new list to add * @head: the place to add it in the first list */ static inline void skb_queue_splice(const struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, (struct sk_buff *) head, head->next); head->qlen += list->qlen; } } /** * skb_queue_splice_init - join two skb lists and reinitialise the emptied list * @list: the new list to add * @head: the place to add it in the first list * * The list at @list is reinitialised */ static inline void skb_queue_splice_init(struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, (struct sk_buff *) head, head->next); head->qlen += list->qlen; __skb_queue_head_init(list); } } /** * skb_queue_splice_tail - join two skb lists, each list being a queue * @list: the new list to add * @head: the place to add it in the first list */ static inline void skb_queue_splice_tail(const struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, head->prev, (struct sk_buff *) head); head->qlen += list->qlen; } } /** * skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list * @list: the new list to add * @head: the place to add it in the first list * * Each of the lists is a queue. * The list at @list is reinitialised */ static inline void skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, head->prev, (struct sk_buff *) head); head->qlen += list->qlen; __skb_queue_head_init(list); } } /** * __skb_queue_after - queue a buffer at the list head * @list: list to use * @prev: place after this buffer * @newsk: buffer to queue * * Queue a buffer int the middle of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_after(struct sk_buff_head *list, struct sk_buff *prev, struct sk_buff *newsk) { __skb_insert(newsk, prev, ((struct sk_buff_list *)prev)->next, list); } void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list); static inline void __skb_queue_before(struct sk_buff_head *list, struct sk_buff *next, struct sk_buff *newsk) { __skb_insert(newsk, ((struct sk_buff_list *)next)->prev, next, list); } /** * __skb_queue_head - queue a buffer at the list head * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the start of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) { __skb_queue_after(list, (struct sk_buff *)list, newsk); } void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); /** * __skb_queue_tail - queue a buffer at the list tail * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the end of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) { __skb_queue_before(list, (struct sk_buff *)list, newsk); } void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk); /* * remove sk_buff from list. _Must_ be called atomically, and with * the list known.. */ void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list); static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { struct sk_buff *next, *prev; WRITE_ONCE(list->qlen, list->qlen - 1); next = skb->next; prev = skb->prev; skb->next = skb->prev = NULL; WRITE_ONCE(next->prev, prev); WRITE_ONCE(prev->next, next); } /** * __skb_dequeue - remove from the head of the queue * @list: list to dequeue from * * Remove the head of the list. This function does not take any locks * so must be used with appropriate locks held only. The head item is * returned or %NULL if the list is empty. */ static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) { struct sk_buff *skb = skb_peek(list); if (skb) __skb_unlink(skb, list); return skb; } struct sk_buff *skb_dequeue(struct sk_buff_head *list); /** * __skb_dequeue_tail - remove from the tail of the queue * @list: list to dequeue from * * Remove the tail of the list. This function does not take any locks * so must be used with appropriate locks held only. The tail item is * returned or %NULL if the list is empty. */ static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list) { struct sk_buff *skb = skb_peek_tail(list); if (skb) __skb_unlink(skb, list); return skb; } struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); static inline bool skb_is_nonlinear(const struct sk_buff *skb) { return skb->data_len; } static inline unsigned int skb_headlen(const struct sk_buff *skb) { return skb->len - skb->data_len; } static inline unsigned int __skb_pagelen(const struct sk_buff *skb) { unsigned int i, len = 0; |