Total coverage: 275148 (18%)of 1580660
2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 // SPDX-License-Identifier: GPL-2.0+ /* * copyright (C) 1999/2000 by Henning Zabel <henning@uni-paderborn.de> */ /* * USB-Kernel Driver for the Mustek MDC800 Digital Camera * (c) 1999/2000 Henning Zabel <henning@uni-paderborn.de> * * * The driver brings the USB functions of the MDC800 to Linux. * To use the Camera you must support the USB Protocol of the camera * to the Kernel Node. * The Driver uses a misc device Node. Create it with : * mknod /dev/mustek c 180 32 * * The driver supports only one camera. * * Fix: mdc800 used sleep_on and slept with io_lock held. * Converted sleep_on to waitqueues with schedule_timeout and made io_lock * a semaphore from a spinlock. * by Oliver Neukum <oliver@neukum.name> * (02/12/2001) * * Identify version on module load. * (08/04/2001) gb * * version 0.7.5 * Fixed potential SMP races with Spinlocks. * Thanks to Oliver Neukum <oliver@neukum.name> who * noticed the race conditions. * (30/10/2000) * * Fixed: Setting urb->dev before submitting urb. * by Greg KH <greg@kroah.com> * (13/10/2000) * * version 0.7.3 * bugfix : The mdc800->state field gets set to READY after the * disconnect function sets it to NOT_CONNECTED. This makes the * driver running like the camera is connected and causes some * hang ups. * * version 0.7.1 * MOD_INC and MOD_DEC are changed in usb_probe to prevent load/unload * problems when compiled as Module. * (04/04/2000) * * The mdc800 driver gets assigned the USB Minor 32-47. The Registration * was updated to use these values. * (26/03/2000) * * The Init und Exit Module Function are updated. * (01/03/2000) * * version 0.7.0 * Rewrite of the driver : The driver now uses URB's. The old stuff * has been removed. * * version 0.6.0 * Rewrite of this driver: The Emulation of the rs232 protocoll * has been removed from the driver. A special executeCommand function * for this driver is included to gphoto. * The driver supports two kind of communication to bulk endpoints. * Either with the dev->bus->ops->bulk... or with callback function. * (09/11/1999) * * version 0.5.0: * first Version that gets a version number. Most of the needed * functions work. * (20/10/1999) */ #include <linux/sched/signal.h> #include <linux/signal.h> #include <linux/spinlock.h> #include <linux/errno.h> #include <linux/random.h> #include <linux/poll.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/wait.h> #include <linux/mutex.h> #include <linux/usb.h> #include <linux/fs.h> /* * Version Information */ #define DRIVER_VERSION "v0.7.5 (30/10/2000)" #define DRIVER_AUTHOR "Henning Zabel <henning@uni-paderborn.de>" #define DRIVER_DESC "USB Driver for Mustek MDC800 Digital Camera" /* Vendor and Product Information */ #define MDC800_VENDOR_ID 0x055f #define MDC800_PRODUCT_ID 0xa800 /* Timeouts (msec) */ #define TO_DOWNLOAD_GET_READY 1500 #define TO_DOWNLOAD_GET_BUSY 1500 #define TO_WRITE_GET_READY 1000 #define TO_DEFAULT_COMMAND 5000 #define TO_READ_FROM_IRQ TO_DEFAULT_COMMAND #define TO_GET_READY TO_DEFAULT_COMMAND /* Minor Number of the device (create with mknod /dev/mustek c 180 32) */ #define MDC800_DEVICE_MINOR_BASE 32 /************************************************************************** Data and structs ***************************************************************************/ typedef enum { NOT_CONNECTED, READY, WORKING, DOWNLOAD } mdc800_state; /* Data for the driver */ struct mdc800_data { struct usb_device * dev; // Device Data mdc800_state state; unsigned int endpoint [4]; struct urb * irq_urb; wait_queue_head_t irq_wait; int irq_woken; char* irq_urb_buffer; int camera_busy; // is camera busy ? int camera_request_ready; // Status to synchronize with irq char camera_response [8]; // last Bytes send after busy struct urb * write_urb; char* write_urb_buffer; wait_queue_head_t write_wait; int written; struct urb * download_urb; char* download_urb_buffer; wait_queue_head_t download_wait; int downloaded; int download_left; // Bytes left to download ? /* Device Data */ char out [64]; // Answer Buffer int out_ptr; // Index to the first not readen byte int out_count; // Bytes in the buffer int open; // Camera device open ? struct mutex io_lock; // IO -lock char in [8]; // Command Input Buffer int in_count; int pic_index; // Cache for the Imagesize (-1 for nothing cached ) int pic_len; int minor; }; /* Specification of the Endpoints */ static struct usb_endpoint_descriptor mdc800_ed [4] = { { .bLength = 0, .bDescriptorType = 0, .bEndpointAddress = 0x01, .bmAttributes = 0x02, .wMaxPacketSize = cpu_to_le16(8), .bInterval = 0, .bRefresh = 0, .bSynchAddress = 0, }, { .bLength = 0, .bDescriptorType = 0, .bEndpointAddress = 0x82, .bmAttributes = 0x03, .wMaxPacketSize = cpu_to_le16(8), .bInterval = 0, .bRefresh = 0, .bSynchAddress = 0, }, { .bLength = 0, .bDescriptorType = 0, .bEndpointAddress = 0x03, .bmAttributes = 0x02, .wMaxPacketSize = cpu_to_le16(64), .bInterval = 0, .bRefresh = 0, .bSynchAddress = 0, }, { .bLength = 0, .bDescriptorType = 0, .bEndpointAddress = 0x84, .bmAttributes = 0x02, .wMaxPacketSize = cpu_to_le16(64), .bInterval = 0, .bRefresh = 0, .bSynchAddress = 0, }, }; /* The Variable used by the driver */ static struct mdc800_data* mdc800; /*************************************************************************** The USB Part of the driver ****************************************************************************/ static int mdc800_endpoint_equals (struct usb_endpoint_descriptor *a,struct usb_endpoint_descriptor *b) { return ( ( a->bEndpointAddress == b->bEndpointAddress ) && ( a->bmAttributes == b->bmAttributes ) && ( a->wMaxPacketSize == b->wMaxPacketSize ) ); } /* * Checks whether the camera responds busy */ static int mdc800_isBusy (char* ch) { int i=0; while (i<8) { if (ch [i] != (char)0x99) return 0; i++; } return 1; } /* * Checks whether the Camera is ready */ static int mdc800_isReady (char *ch) { int i=0; while (i<8) { if (ch [i] != (char)0xbb) return 0; i++; } return 1; } /* * USB IRQ Handler for InputLine */ static void mdc800_usb_irq (struct urb *urb) { int data_received=0, wake_up; unsigned char* b=urb->transfer_buffer; struct mdc800_data* mdc800=urb->context; struct device *dev = &mdc800->dev->dev; int status = urb->status; if (status >= 0) { if (mdc800_isBusy (b)) { if (!mdc800->camera_busy) { mdc800->camera_busy=1; dev_dbg(dev, "gets busy\n"); } } else { if (mdc800->camera_busy && mdc800_isReady (b)) { mdc800->camera_busy=0; dev_dbg(dev, "gets ready\n"); } } if (!(mdc800_isBusy (b) || mdc800_isReady (b))) { /* Store Data in camera_answer field */ dev_dbg(dev, "%i %i %i %i %i %i %i %i \n",b[0],b[1],b[2],b[3],b[4],b[5],b[6],b[7]); memcpy (mdc800->camera_response,b,8); data_received=1; } } wake_up= ( mdc800->camera_request_ready > 0 ) && ( ((mdc800->camera_request_ready == 1) && (!mdc800->camera_busy)) || ((mdc800->camera_request_ready == 2) && data_received) || ((mdc800->camera_request_ready == 3) && (mdc800->camera_busy)) || (status < 0) ); if (wake_up) { mdc800->camera_request_ready=0; mdc800->irq_woken=1; wake_up (&mdc800->irq_wait); } } /* * Waits a while until the irq responds that camera is ready * * mode : 0: Wait for camera gets ready * 1: Wait for receiving data * 2: Wait for camera gets busy * * msec: Time to wait */ static int mdc800_usb_waitForIRQ (int mode, int msec) { mdc800->camera_request_ready=1+mode; wait_event_timeout(mdc800->irq_wait, mdc800->irq_woken, msecs_to_jiffies(msec)); mdc800->irq_woken = 0; if (mdc800->camera_request_ready>0) { mdc800->camera_request_ready=0; dev_err(&mdc800->dev->dev, "timeout waiting for camera.\n"); return -1; } if (mdc800->state == NOT_CONNECTED) { printk(KERN_WARNING "mdc800: Camera gets disconnected " "during waiting for irq.\n"); mdc800->camera_request_ready=0; return -2; } return 0; } /* * The write_urb callback function */ static void mdc800_usb_write_notify (struct urb *urb) { struct mdc800_data* mdc800=urb->context; int status = urb->status; if (status != 0) dev_err(&mdc800->dev->dev, "writing command fails (status=%i)\n", status); else mdc800->state=READY; mdc800->written = 1; wake_up (&mdc800->write_wait); } /* * The download_urb callback function */ static void mdc800_usb_download_notify (struct urb *urb) { struct mdc800_data* mdc800=urb->context; int status = urb->status; if (status == 0) { /* Fill output buffer with these data */ memcpy (mdc800->out, urb->transfer_buffer, 64); mdc800->out_count=64; mdc800->out_ptr=0; mdc800->download_left-=64; if (mdc800->download_left == 0) { mdc800->state=READY; } } else { dev_err(&mdc800->dev->dev, "request bytes fails (status:%i)\n", status); } mdc800->downloaded = 1; wake_up (&mdc800->download_wait); } /*************************************************************************** Probing for the Camera ***************************************************************************/ static struct usb_driver mdc800_usb_driver; static const struct file_operations mdc800_device_ops; static struct usb_class_driver mdc800_class = { .name = "mdc800%d", .fops = &mdc800_device_ops, .minor_base = MDC800_DEVICE_MINOR_BASE, }; /* * Callback to search the Mustek MDC800 on the USB Bus */ static int mdc800_usb_probe (struct usb_interface *intf, const struct usb_device_id *id) { int i,j; struct usb_host_interface *intf_desc; struct usb_device *dev = interface_to_usbdev (intf); int irq_interval=0; int retval; dev_dbg(&intf->dev, "(%s) called.\n", __func__); if (mdc800->dev != NULL) { dev_warn(&intf->dev, "only one Mustek MDC800 is supported.\n"); return -ENODEV; } if (dev->descriptor.bNumConfigurations != 1) { dev_err(&intf->dev, "probe fails -> wrong Number of Configuration\n"); return -ENODEV; } intf_desc = intf->cur_altsetting; if ( ( intf_desc->desc.bInterfaceClass != 0xff ) || ( intf_desc->desc.bInterfaceSubClass != 0 ) || ( intf_desc->desc.bInterfaceProtocol != 0 ) || ( intf_desc->desc.bNumEndpoints != 4) ) { dev_err(&intf->dev, "probe fails -> wrong Interface\n"); return -ENODEV; } /* Check the Endpoints */ for (i=0; i<4; i++) { mdc800->endpoint[i]=-1; for (j=0; j<4; j++) { if (mdc800_endpoint_equals (&intf_desc->endpoint [j].desc,&mdc800_ed [i])) { mdc800->endpoint[i]=intf_desc->endpoint [j].desc.bEndpointAddress ; if (i==1) { irq_interval=intf_desc->endpoint [j].desc.bInterval; } } } if (mdc800->endpoint[i] == -1) { dev_err(&intf->dev, "probe fails -> Wrong Endpoints.\n"); return -ENODEV; } } dev_info(&intf->dev, "Found Mustek MDC800 on USB.\n"); mutex_lock(&mdc800->io_lock); retval = usb_register_dev(intf, &mdc800_class); if (retval) { dev_err(&intf->dev, "Not able to get a minor for this device.\n"); mutex_unlock(&mdc800->io_lock); return -ENODEV; } mdc800->dev=dev; mdc800->open=0; /* Setup URB Structs */ usb_fill_int_urb ( mdc800->irq_urb, mdc800->dev, usb_rcvintpipe (mdc800->dev,mdc800->endpoint [1]), mdc800->irq_urb_buffer, 8, mdc800_usb_irq, mdc800, irq_interval ); usb_fill_bulk_urb ( mdc800->write_urb, mdc800->dev, usb_sndbulkpipe (mdc800->dev, mdc800->endpoint[0]), mdc800->write_urb_buffer, 8, mdc800_usb_write_notify, mdc800 ); usb_fill_bulk_urb ( mdc800->download_urb, mdc800->dev, usb_rcvbulkpipe (mdc800->dev, mdc800->endpoint [3]), mdc800->download_urb_buffer, 64, mdc800_usb_download_notify, mdc800 ); mdc800->state=READY; mutex_unlock(&mdc800->io_lock); usb_set_intfdata(intf, mdc800); return 0; } /* * Disconnect USB device (maybe the MDC800) */ static void mdc800_usb_disconnect (struct usb_interface *intf) { struct mdc800_data* mdc800 = usb_get_intfdata(intf); dev_dbg(&intf->dev, "(%s) called\n", __func__); if (mdc800) { if (mdc800->state == NOT_CONNECTED) return; usb_deregister_dev(intf, &mdc800_class); /* must be under lock to make sure no URB is submitted after usb_kill_urb() */ mutex_lock(&mdc800->io_lock); mdc800->state=NOT_CONNECTED; usb_kill_urb(mdc800->irq_urb); usb_kill_urb(mdc800->write_urb); usb_kill_urb(mdc800->download_urb); mutex_unlock(&mdc800->io_lock); mdc800->dev = NULL; usb_set_intfdata(intf, NULL); } dev_info(&intf->dev, "Mustek MDC800 disconnected from USB.\n"); } /*************************************************************************** The Misc device Part (file_operations) ****************************************************************************/ /* * This Function calc the Answersize for a command. */ static int mdc800_getAnswerSize (char command) { switch ((unsigned char) command) { case 0x2a: case 0x49: case 0x51: case 0x0d: case 0x20: case 0x07: case 0x01: case 0x25: case 0x00: return 8; case 0x05: case 0x3e: return mdc800->pic_len; case 0x09: return 4096; default: return 0; } } /* * Init the device: (1) alloc mem (2) Increase MOD Count .. */ static int mdc800_device_open (struct inode* inode, struct file *file) { int retval=0; int errn=0; mutex_lock(&mdc800->io_lock); if (mdc800->state == NOT_CONNECTED) { errn=-EBUSY; goto error_out; } if (mdc800->open) { errn=-EBUSY; goto error_out; } mdc800->in_count=0; mdc800->out_count=0; mdc800->out_ptr=0; mdc800->pic_index=0; mdc800->pic_len=-1; mdc800->download_left=0; mdc800->camera_busy=0; mdc800->camera_request_ready=0; mdc800->irq_urb->dev = mdc800->dev; retval = usb_submit_urb (mdc800->irq_urb, GFP_KERNEL); if (retval) { dev_err(&mdc800->dev->dev, "request USB irq fails (submit_retval=%i).\n", retval); errn = -EIO; goto error_out; } mdc800->open=1; dev_dbg(&mdc800->dev->dev, "Mustek MDC800 device opened.\n"); error_out: mutex_unlock(&mdc800->io_lock); return errn; } /* * Close the Camera and release Memory */ static int mdc800_device_release (struct inode* inode, struct file *file) { int retval=0; mutex_lock(&mdc800->io_lock); if (mdc800->open && (mdc800->state != NOT_CONNECTED)) { usb_kill_urb(mdc800->irq_urb); usb_kill_urb(mdc800->write_urb); usb_kill_urb(mdc800->download_urb); mdc800->open=0; } else { retval=-EIO; } mutex_unlock(&mdc800->io_lock); return retval; } /* * The Device read callback Function */ static ssize_t mdc800_device_read (struct file *file, char __user *buf, size_t len, loff_t *pos) { size_t left=len, sts=len; /* single transfer size */ char __user *ptr = buf; int retval; mutex_lock(&mdc800->io_lock); if (mdc800->state == NOT_CONNECTED) { mutex_unlock(&mdc800->io_lock); return -EBUSY; } if (mdc800->state == WORKING) { printk(KERN_WARNING "mdc800: Illegal State \"working\"" "reached during read ?!\n"); mutex_unlock(&mdc800->io_lock); return -EBUSY; } if (!mdc800->open) { mutex_unlock(&mdc800->io_lock); return -EBUSY; } while (left) { if (signal_pending (current)) { mutex_unlock(&mdc800->io_lock); return -EINTR; } sts=left > (mdc800->out_count-mdc800->out_ptr)?mdc800->out_count-mdc800->out_ptr:left; if (sts <= 0) { /* Too less Data in buffer */ if (mdc800->state == DOWNLOAD) { mdc800->out_count=0; mdc800->out_ptr=0; /* Download -> Request new bytes */ mdc800->download_urb->dev = mdc800->dev; retval = usb_submit_urb (mdc800->download_urb, GFP_KERNEL); if (retval) { dev_err(&mdc800->dev->dev, "Can't submit download urb " "(retval=%i)\n", retval); mutex_unlock(&mdc800->io_lock); return len-left; } wait_event_timeout(mdc800->download_wait, mdc800->downloaded, msecs_to_jiffies(TO_DOWNLOAD_GET_READY)); mdc800->downloaded = 0; if (mdc800->download_urb->status != 0) { dev_err(&mdc800->dev->dev, "request download-bytes fails " "(status=%i)\n", mdc800->download_urb->status); mutex_unlock(&mdc800->io_lock); return len-left; } } else { /* No more bytes -> that's an error*/ mutex_unlock(&mdc800->io_lock); return -EIO; } } else { /* Copy Bytes */ if (copy_to_user(ptr, &mdc800->out [mdc800->out_ptr], sts)) { mutex_unlock(&mdc800->io_lock); return -EFAULT; } ptr+=sts; left-=sts; mdc800->out_ptr+=sts; } } mutex_unlock(&mdc800->io_lock); return len-left; } /* * The Device write callback Function * If a 8Byte Command is received, it will be send to the camera. * After this the driver initiates the request for the answer or * just waits until the camera becomes ready. */ static ssize_t mdc800_device_write (struct file *file, const char __user *buf, size_t len, loff_t *pos) { size_t i=0; int retval; mutex_lock(&mdc800->io_lock); if (mdc800->state != READY) { mutex_unlock(&mdc800->io_lock); return -EBUSY; } if (!mdc800->open ) { mutex_unlock(&mdc800->io_lock); return -EBUSY; } while (i<len) { unsigned char c; if (signal_pending (current)) { mutex_unlock(&mdc800->io_lock); return -EINTR; } if(get_user(c, buf+i)) { mutex_unlock(&mdc800->io_lock); return -EFAULT; } /* check for command start */ if (c == 0x55) { mdc800->in_count=0; mdc800->out_count=0; mdc800->out_ptr=0; mdc800->download_left=0; } /* save command byte */ if (mdc800->in_count < 8) { mdc800->in[mdc800->in_count] = c; mdc800->in_count++; } else { mutex_unlock(&mdc800->io_lock); return -EIO; } /* Command Buffer full ? -> send it to camera */ if (mdc800->in_count == 8) { int answersize; if (mdc800_usb_waitForIRQ (0,TO_GET_READY)) { dev_err(&mdc800->dev->dev, "Camera didn't get ready.\n"); mutex_unlock(&mdc800->io_lock); return -EIO; } answersize=mdc800_getAnswerSize (mdc800->in[1]); mdc800->state=WORKING; memcpy (mdc800->write_urb->transfer_buffer, mdc800->in,8); mdc800->write_urb->dev = mdc800->dev; retval = usb_submit_urb (mdc800->write_urb, GFP_KERNEL); if (retval) { dev_err(&mdc800->dev->dev, "submitting write urb fails " "(retval=%i)\n", retval); mutex_unlock(&mdc800->io_lock); return -EIO; } wait_event_timeout(mdc800->write_wait, mdc800->written, msecs_to_jiffies(TO_WRITE_GET_READY)); mdc800->written = 0; if (mdc800->state == WORKING) { usb_kill_urb(mdc800->write_urb); mutex_unlock(&mdc800->io_lock); return -EIO; } switch ((unsigned char) mdc800->in[1]) { case 0x05: /* Download Image */ case 0x3e: /* Take shot in Fine Mode (WCam Mode) */ if (mdc800->pic_len < 0) { dev_err(&mdc800->dev->dev, "call 0x07 before " "0x05,0x3e\n"); mdc800->state=READY; mutex_unlock(&mdc800->io_lock); return -EIO; } mdc800->pic_len=-1; fallthrough; case 0x09: /* Download Thumbnail */ mdc800->download_left=answersize+64; mdc800->state=DOWNLOAD; mdc800_usb_waitForIRQ (0,TO_DOWNLOAD_GET_BUSY); break; default: if (answersize) { if (mdc800_usb_waitForIRQ (1,TO_READ_FROM_IRQ)) { dev_err(&mdc800->dev->dev, "requesting answer from irq fails\n"); mutex_unlock(&mdc800->io_lock); return -EIO; } /* Write dummy data, (this is ugly but part of the USB Protocol */ /* if you use endpoint 1 as bulk and not as irq) */ memcpy (mdc800->out, mdc800->camera_response,8); /* This is the interpreted answer */ memcpy (&mdc800->out[8], mdc800->camera_response,8); mdc800->out_ptr=0; mdc800->out_count=16; /* Cache the Imagesize, if command was getImageSize */ if (mdc800->in [1] == (char) 0x07) { mdc800->pic_len=(int) 65536*(unsigned char) mdc800->camera_response[0]+256*(unsigned char) mdc800->camera_response[1]+(unsigned char) mdc800->camera_response[2]; dev_dbg(&mdc800->dev->dev, "cached imagesize = %i\n", mdc800->pic_len); } } else { if (mdc800_usb_waitForIRQ (0,TO_DEFAULT_COMMAND)) { dev_err(&mdc800->dev->dev, "Command Timeout.\n"); mutex_unlock(&mdc800->io_lock); return -EIO; } } mdc800->state=READY; break; } } i++; } mutex_unlock(&mdc800->io_lock); return i; } /*************************************************************************** Init and Cleanup this driver (Structs and types) ****************************************************************************/ /* File Operations of this drivers */ static const struct file_operations mdc800_device_ops = { .owner = THIS_MODULE, .read = mdc800_device_read, .write = mdc800_device_write, .open = mdc800_device_open, .release = mdc800_device_release, .llseek = noop_llseek, }; static const struct usb_device_id mdc800_table[] = { { USB_DEVICE(MDC800_VENDOR_ID, MDC800_PRODUCT_ID) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE (usb, mdc800_table); /* * USB Driver Struct for this device */ static struct usb_driver mdc800_usb_driver = { .name = "mdc800", .probe = mdc800_usb_probe, .disconnect = mdc800_usb_disconnect, .id_table = mdc800_table }; /************************************************************************ Init and Cleanup this driver (Main Functions) *************************************************************************/ static int __init usb_mdc800_init (void) { int retval = -ENODEV; /* Allocate Memory */ mdc800=kzalloc (sizeof (struct mdc800_data), GFP_KERNEL); if (!mdc800) goto cleanup_on_fail; mdc800->dev = NULL; mdc800->state=NOT_CONNECTED; mutex_init (&mdc800->io_lock); init_waitqueue_head (&mdc800->irq_wait); init_waitqueue_head (&mdc800->write_wait); init_waitqueue_head (&mdc800->download_wait); mdc800->irq_woken = 0; mdc800->downloaded = 0; mdc800->written = 0; mdc800->irq_urb_buffer=kmalloc (8, GFP_KERNEL); if (!mdc800->irq_urb_buffer) goto cleanup_on_fail; mdc800->write_urb_buffer=kmalloc (8, GFP_KERNEL); if (!mdc800->write_urb_buffer) goto cleanup_on_fail; mdc800->download_urb_buffer=kmalloc (64, GFP_KERNEL); if (!mdc800->download_urb_buffer) goto cleanup_on_fail; mdc800->irq_urb=usb_alloc_urb (0, GFP_KERNEL); if (!mdc800->irq_urb) goto cleanup_on_fail; mdc800->download_urb=usb_alloc_urb (0, GFP_KERNEL); if (!mdc800->download_urb) goto cleanup_on_fail; mdc800->write_urb=usb_alloc_urb (0, GFP_KERNEL); if (!mdc800->write_urb) goto cleanup_on_fail; /* Register the driver */ retval = usb_register(&mdc800_usb_driver); if (retval) goto cleanup_on_fail; printk(KERN_INFO KBUILD_MODNAME ": " DRIVER_VERSION ":" DRIVER_DESC "\n"); return 0; /* Clean driver up, when something fails */ cleanup_on_fail: if (mdc800 != NULL) { printk(KERN_ERR "mdc800: can't alloc memory!\n"); kfree(mdc800->download_urb_buffer); kfree(mdc800->write_urb_buffer); kfree(mdc800->irq_urb_buffer); usb_free_urb(mdc800->write_urb); usb_free_urb(mdc800->download_urb); usb_free_urb(mdc800->irq_urb); kfree (mdc800); } mdc800 = NULL; return retval; } static void __exit usb_mdc800_cleanup (void) { usb_deregister (&mdc800_usb_driver); usb_free_urb (mdc800->irq_urb); usb_free_urb (mdc800->download_urb); usb_free_urb (mdc800->write_urb); kfree (mdc800->irq_urb_buffer); kfree (mdc800->write_urb_buffer); kfree (mdc800->download_urb_buffer); kfree (mdc800); mdc800 = NULL; } module_init (usb_mdc800_init); module_exit (usb_mdc800_cleanup); MODULE_AUTHOR( DRIVER_AUTHOR ); MODULE_DESCRIPTION( DRIVER_DESC ); MODULE_LICENSE("GPL");
20 10 20 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 /* * linux/fs/nls/nls_cp1251.c * * Charset cp1251 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021, 0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f, /* 0x90*/ 0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, 0x0000, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f, /* 0xa0*/ 0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7, 0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407, /* 0xb0*/ 0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7, 0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457, /* 0xc0*/ 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, /* 0xd0*/ 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, /* 0xe0*/ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, /* 0xf0*/ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0x00, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ 0xb0, 0xb1, 0x00, 0x00, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page04[256] = { 0x00, 0xa8, 0x80, 0x81, 0xaa, 0xbd, 0xb2, 0xaf, /* 0x00-0x07 */ 0xa3, 0x8a, 0x8c, 0x8e, 0x8d, 0x00, 0xa1, 0x8f, /* 0x08-0x0f */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x10-0x17 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x18-0x1f */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x20-0x27 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0x28-0x2f */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x30-0x37 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x38-0x3f */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x40-0x47 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0x48-0x4f */ 0x00, 0xb8, 0x90, 0x83, 0xba, 0xbe, 0xb3, 0xbf, /* 0x50-0x57 */ 0xbc, 0x9a, 0x9c, 0x9e, 0x9d, 0x00, 0xa2, 0x9f, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0xa5, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */ 0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ }; static const unsigned char page21[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, page21, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x90, 0x83, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x9a, 0x8b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa2, 0xa2, 0xbc, 0xa4, 0xb4, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xb8, 0xa9, 0xba, 0xab, 0xac, 0xad, 0xae, 0xbf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb3, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x81, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x80, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x8a, 0x9b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa1, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb2, 0xa5, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xa8, 0xb9, 0xaa, 0xbb, 0xa3, 0xbd, 0xbd, 0xaf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp1251", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp1251(void) { return register_nls(&table); } static void __exit exit_nls_cp1251(void) { unregister_nls(&table); } module_init(init_nls_cp1251) module_exit(exit_nls_cp1251) MODULE_DESCRIPTION("NLS Windows CP1251 (Bulgarian, Belarusian)"); MODULE_LICENSE("Dual BSD/GPL");
2251 2264 561 272 248 430 340 430 430 340 430 340 430 430 430 339 340 340 312 32 138 112 19 6 19 2 1 1 19 3642 41 41 3638 3638 3638 3642 33 33 4321 4314 97 97 708 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/gfp.h> #include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/fixmap.h> #include <asm/mtrr.h> #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); SYM_PIC_ALIAS(physical_mask); #endif pgtable_t pte_alloc_one(struct mm_struct *mm) { return __pte_alloc_one(mm, GFP_PGTABLE_USER); } void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { paravirt_release_pte(page_to_pfn(pte)); tlb_remove_ptdesc(tlb, page_ptdesc(pte)); } #if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); /* * NOTE! For PAE, any changes to the top page-directory-pointer-table * entries need a full cr3 reload to flush. */ #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ static inline void pgd_list_add(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_add(&ptdesc->pt_list, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_del(&ptdesc->pt_list); } static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { virt_to_ptdesc(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { return page_ptdesc(page)->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* PAE preallocates all its PMDs. No cloning needed. */ if (!IS_ENABLED(CONFIG_X86_PAE)) clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); /* List used to sync kernel mapping updates */ pgd_set_mm(pgd, mm); pgd_list_add(pgd); } static void pgd_dtor(pgd_t *pgd) { spin_lock(&pgd_lock); pgd_list_del(pgd); spin_unlock(&pgd_lock); } /* * List of all pgd's needed for non-PAE so it can invalidate entries * in both cached and uncached pgd's; not needed for PAE since the * kernel pmd is shared. If PAE were not to share the pmd a similar * tactic would be needed. This is essentially codepath-based locking * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. * -- nyc */ #ifdef CONFIG_X86_PAE /* * In PAE mode, we need to do a cr3 reload (=tlb flush) when * updating the top-level pagetable entries to guarantee the * processor notices the update. Since this is expensive, and * all 4 top-level entries are used almost immediately in a * new process's life, we just pre-populate them here. */ #define PREALLOCATED_PMDS PTRS_PER_PGD /* * "USER_PMDS" are the PMDs for the user copy of the page tables when * PTI is enabled. They do not exist when PTI is disabled. Note that * this is distinct from the user _portion_ of the kernel page tables * which always exists. * * We allocate separate PMDs for the kernel part of the user page-table * when PTI is enabled. We need them to map the per-process LDT into the * user-space page-table. */ #define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? \ KERNEL_PGD_PTRS : 0) #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); /* Note: almost everything apart from _PAGE_PRESENT is reserved at the pmd (PDPT) level. */ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); /* * According to Intel App note "TLBs, Paging-Structure Caches, * and Their Invalidation", April 2007, document 317080-001, * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... */ flush_tlb_mm(mm); } #else /* !CONFIG_X86_PAE */ /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 #define PREALLOCATED_USER_PMDS 0 #define MAX_PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; struct ptdesc *ptdesc; for (i = 0; i < count; i++) if (pmds[i]) { ptdesc = virt_to_ptdesc(pmds[i]); pagetable_dtor(ptdesc); pagetable_free(ptdesc); mm_dec_nr_pmds(mm); } } static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; bool failed = false; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; gfp &= ~__GFP_HIGHMEM; for (i = 0; i < count; i++) { pmd_t *pmd = NULL; struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) failed = true; if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); ptdesc = NULL; failed = true; } if (ptdesc) { mm_inc_nr_pmds(mm); pmd = ptdesc_address(ptdesc); } pmds[i] = pmd; } if (failed) { free_pmds(mm, pmds, count); return -ENOMEM; } return 0; } /* * Mop up any pmd pages which may still be attached to the pgd. * Normally they will be freed by munmap/exit_mmap, but any pmd we * preallocate which never got a corresponding vma will need to be * freed manually. */ static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) { pgd_t pgd = *pgdp; if (pgd_val(pgd) != 0) { pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); pgd_clear(pgdp); paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); } } static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; for (i = 0; i < PREALLOCATED_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i]); #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!boot_cpu_has(X86_FEATURE_PTI)) return; pgdp = kernel_to_user_pgdp(pgdp); for (i = 0; i < PREALLOCATED_USER_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); #endif } static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { p4d_t *p4d; pud_t *pud; int i; p4d = p4d_offset(pgd, 0); pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, pud, pmd); } } #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); p4d_t *u_p4d; pud_t *u_pud; int i; u_p4d = p4d_offset(u_pgd, 0); u_pud = pud_offset(u_p4d, 0); s_pgd += KERNEL_PGD_BOUNDARY; u_pud += KERNEL_PGD_BOUNDARY; for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { pmd_t *pmd = pmds[i]; memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, u_pud, pmd); } } #else static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { } #endif static inline pgd_t *_pgd_alloc(struct mm_struct *mm) { /* * PTI and Xen need a whole page for the PAE PGD * even though the hardware only needs 32 bytes. * * For simplicity, allocate a page for all users. */ return __pgd_alloc(mm, pgd_allocation_order()); } static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) { __pgd_free(mm, pgd); } pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; pmd_t *pmds[PREALLOCATED_PMDS]; pgd = _pgd_alloc(mm); if (pgd == NULL) goto out; mm->pgd = pgd; if (sizeof(pmds) != 0 && preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) goto out_free_pgd; if (sizeof(u_pmds) != 0 && preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) goto out_free_pmds; if (paravirt_pgd_alloc(mm) != 0) goto out_free_user_pmds; /* * Make sure that pre-populating the pmds is atomic with * respect to anything walking the pgd_list, so that they * never see a partially populated pgd. */ spin_lock(&pgd_lock); pgd_ctor(mm, pgd); if (sizeof(pmds) != 0) pgd_prepopulate_pmd(mm, pgd, pmds); if (sizeof(u_pmds) != 0) pgd_prepopulate_user_pmd(mm, pgd, u_pmds); spin_unlock(&pgd_lock); return pgd; out_free_user_pmds: if (sizeof(u_pmds) != 0) free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); out_free_pmds: if (sizeof(pmds) != 0) free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: _pgd_free(mm, pgd); out: return NULL; } void pgd_free(struct mm_struct *mm, pgd_t *pgd) { pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); _pgd_free(mm, pgd); } /* * Used to set accessed or dirty bits in the page table entries * on other architectures. On x86, the accessed and dirty bits * are tracked by hardware. However, do_wp_page calls this function * to also make the pte writeable at the same time the dirty bit is * set. In that case we do actually need to write the PTE. */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed = !pte_same(*ptep, entry); if (changed && dirty) set_pte(ptep, entry); return changed; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { int changed = !pmd_same(*pmdp, entry); VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed && dirty) { set_pmd(pmdp, entry); /* * We had a write-protection fault here and changed the pmd * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { int changed = !pud_same(*pudp, entry); VM_BUG_ON(address & ~HPAGE_PUD_MASK); if (changed && dirty) { set_pud(pudp, entry); /* * We had a write-protection fault here and changed the pud * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } #endif int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { int ret = 0; if (pte_young(*ptep)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *) &ptep->pte); return ret; } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { int ret = 0; if (pmd_young(*pmdp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmdp); return ret; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp) { int ret = 0; if (pud_young(*pudp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pudp); return ret; } #endif int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { /* * On x86 CPUs, clearing the accessed bit without a TLB flush * doesn't cause data corruption. [ It could cause incorrect * page aging and the (mistaken) reclaim of hot pages, but the * chance of that should be relatively low. ] * * So as a performance optimization don't flush the TLB when * clearing the accessed bit, it will eventually be flushed by * a context switch or a VM operation anyway. [ In the rare * event of it not getting flushed for a long time the delay * shouldn't really matter because there's no real memory * pressure for swapout to react to. ] */ return ptep_test_and_clear_young(vma, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { int young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return young; } pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); /* * No flush is necessary. Once an invalid PTE is established, the PTE's * access and dirty bits cannot be updated. */ return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); } #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, pud_t *pudp) { VM_WARN_ON_ONCE(!pud_present(*pudp)); pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp)); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); return old; } #endif /** * reserve_top_address - Reserve a hole in the top of the kernel address space * @reserve: Size of hole to reserve * * Can be used to relocate the fixmap area and poke a hole in the top * of the kernel address space to make room for a hypervisor. */ void __init reserve_top_address(unsigned long reserve) { #ifdef CONFIG_X86_32 BUG_ON(fixmaps_set > 0); __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", -reserve, __FIXADDR_TOP + PAGE_SIZE); #endif } int fixmaps_set; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); #ifdef CONFIG_X86_64 /* * Ensure that the static initial page tables are covering the * fixmap completely. */ BUILD_BUG_ON(__end_of_permanent_fixed_addresses > (FIXMAP_PMD_NUM * PTRS_PER_PTE)); #endif if (idx >= __end_of_fixed_addresses) { BUG(); return; } set_pte_vaddr(address, pte); fixmaps_set++; } void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { /* Sanitize 'prot' against any unsupported bits: */ pgprot_val(flags) &= __default_kernel_pte_mask; __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #if CONFIG_PGTABLE_LEVELS > 4 /** * p4d_set_huge - Set up kernel P4D mapping * @p4d: Pointer to the P4D entry * @addr: Virtual address associated with the P4D entry * @prot: Protection bits to use * * No 512GB pages yet -- always return 0 */ int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } /** * p4d_clear_huge - Clear kernel P4D mapping when it is set * @p4d: Pointer to the P4D entry to clear * * No 512GB pages yet -- do nothing */ void p4d_clear_huge(p4d_t *p4d) { } #endif /** * pud_set_huge - Set up kernel PUD mapping * @pud: Pointer to the PUD entry * @addr: Virtual address associated with the PUD entry * @prot: Protection bits to use * * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this * function sets up a huge page only if the complete range has the same MTRR * caching mode. * * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger * page mapping attempt fails. * * Returns 1 on success and 0 on failure. */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); if (!uniform) return 0; /* Bail out if we are we on a populated non-leaf entry: */ if (pud_present(*pud) && !pud_leaf(*pud)) return 0; set_pte((pte_t *)pud, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pmd_set_huge - Set up kernel PMD mapping * @pmd: Pointer to the PMD entry * @addr: Virtual address associated with the PMD entry * @prot: Protection bits to use * * See text over pud_set_huge() above. * * Returns 1 on success and 0 on failure. */ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); if (!uniform) { pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", __func__, addr, addr + PMD_SIZE); return 0; } /* Bail out if we are we on a populated non-leaf entry: */ if (pmd_present(*pmd) && !pmd_leaf(*pmd)) return 0; set_pte((pte_t *)pmd, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pud_clear_huge - Clear kernel PUD mapping when it is set * @pud: Pointer to the PUD entry to clear. * * Returns 1 on success and 0 on failure (no PUD map is found). */ int pud_clear_huge(pud_t *pud) { if (pud_leaf(*pud)) { pud_clear(pud); return 1; } return 0; } /** * pmd_clear_huge - Clear kernel PMD mapping when it is set * @pmd: Pointer to the PMD entry to clear. * * Returns 1 on success and 0 on failure (no PMD map is found). */ int pmd_clear_huge(pmd_t *pmd) { if (pmd_leaf(*pmd)) { pmd_clear(pmd); return 1; } return 0; } #ifdef CONFIG_X86_64 /** * pud_free_pmd_page - Clear PUD entry and free PMD page * @pud: Pointer to a PUD * @addr: Virtual address associated with PUD * * Context: The PUD range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. * * NOTE: Callers must allow a single page allocation. */ int pud_free_pmd_page(pud_t *pud, unsigned long addr) { pmd_t *pmd, *pmd_sv; pte_t *pte; int i; pmd = pud_pgtable(*pud); pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); if (!pmd_sv) return 0; for (i = 0; i < PTRS_PER_PMD; i++) { pmd_sv[i] = pmd[i]; if (!pmd_none(pmd[i])) pmd_clear(&pmd[i]); } pud_clear(pud); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); for (i = 0; i < PTRS_PER_PMD; i++) { if (!pmd_none(pmd_sv[i])) { pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); pte_free_kernel(&init_mm, pte); } } free_page((unsigned long)pmd_sv); pmd_free(&init_mm, pmd); return 1; } /** * pmd_free_pte_page - Clear PMD entry and free PTE page. * @pmd: Pointer to the PMD * @addr: Virtual address associated with PMD * * Context: The PMD range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { pte_t *pte; pte = (pte_t *)pmd_page_vaddr(*pmd); pmd_clear(pmd); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); pte_free_kernel(&init_mm, pte); return 1; } #else /* !CONFIG_X86_64 */ /* * Disable free page handling on x86-PAE. This assures that ioremap() * does not update sync'd PMD entries. See vmalloc_sync_one(). */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return pmd_none(*pmd); } #endif /* CONFIG_X86_64 */ #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pte_mkwrite_shstk(pte); pte = pte_mkwrite_novma(pte); return pte_clear_saveddirty(pte); } pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pmd_mkwrite_shstk(pmd); pmd = pmd_mkwrite_novma(pmd); return pmd_clear_saveddirty(pmd); } void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { /* * Hardware before shadow stack can (rarely) set Dirty=1 * on a Write=0 PTE. So the below condition * only indicates a software bug when shadow stack is * supported by the HW. This checking is covered in * pte_shstk(). */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pte_shstk(pte)); } void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { /* See note in arch_check_zapped_pte() */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pmd_shstk(pmd)); } void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) { /* See note in arch_check_zapped_pte() */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud)); }
3 2 1 3 2 1 1 2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 // SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * * See COPYING in top-level directory. */ #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-bufmap.h" #include <linux/hashtable.h> #include <linux/seq_file.h> /* a cache for orangefs-inode objects (i.e. orangefs inode private data) */ static struct kmem_cache *orangefs_inode_cache; /* list for storing orangefs specific superblocks in use */ LIST_HEAD(orangefs_superblocks); DEFINE_SPINLOCK(orangefs_superblocks_lock); enum { Opt_acl, Opt_intr, Opt_local_lock, }; const struct fs_parameter_spec orangefs_fs_param_spec[] = { fsparam_flag ("acl", Opt_acl), fsparam_flag ("intr", Opt_intr), fsparam_flag ("local_lock", Opt_local_lock), {} }; uint64_t orangefs_features; static int orangefs_show_options(struct seq_file *m, struct dentry *root) { struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(root->d_sb); if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); if (orangefs_sb->flags & ORANGEFS_OPT_INTR) seq_puts(m, ",intr"); if (orangefs_sb->flags & ORANGEFS_OPT_LOCAL_LOCK) seq_puts(m, ",local_lock"); return 0; } static int orangefs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct orangefs_sb_info_s *orangefs_sb = fc->s_fs_info; struct fs_parse_result result; int opt; opt = fs_parse(fc, orangefs_fs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_acl: fc->sb_flags |= SB_POSIXACL; break; case Opt_intr: orangefs_sb->flags |= ORANGEFS_OPT_INTR; break; case Opt_local_lock: orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK; break; } return 0; } static void orangefs_inode_cache_ctor(void *req) { struct orangefs_inode_s *orangefs_inode = req; inode_init_once(&orangefs_inode->vfs_inode); init_rwsem(&orangefs_inode->xattr_sem); } static struct inode *orangefs_alloc_inode(struct super_block *sb) { struct orangefs_inode_s *orangefs_inode; orangefs_inode = alloc_inode_sb(sb, orangefs_inode_cache, GFP_KERNEL); if (!orangefs_inode) return NULL; /* * We want to clear everything except for rw_semaphore and the * vfs_inode. */ memset(&orangefs_inode->refn.khandle, 0, 16); orangefs_inode->refn.fs_id = ORANGEFS_FS_ID_NULL; orangefs_inode->last_failed_block_index_read = 0; memset(orangefs_inode->link_target, 0, sizeof(orangefs_inode->link_target)); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_alloc_inode: allocated %p\n", &orangefs_inode->vfs_inode); return &orangefs_inode->vfs_inode; } static void orangefs_free_inode(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_cached_xattr *cx; struct hlist_node *tmp; int i; hash_for_each_safe(orangefs_inode->xattr_cache, i, tmp, cx, node) { hlist_del(&cx->node); kfree(cx); } kmem_cache_free(orangefs_inode_cache, orangefs_inode); } static void orangefs_destroy_inode(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); gossip_debug(GOSSIP_SUPER_DEBUG, "%s: deallocated %p destroying inode %pU\n", __func__, orangefs_inode, get_khandle_from_ino(inode)); } static int orangefs_write_inode(struct inode *inode, struct writeback_control *wbc) { gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_write_inode\n"); return orangefs_inode_setattr(inode); } /* * NOTE: information filled in here is typically reflected in the * output of the system command 'df' */ static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf) { int ret = -ENOMEM; struct orangefs_kernel_op_s *new_op = NULL; int flags = 0; struct super_block *sb = NULL; sb = dentry->d_sb; gossip_debug(GOSSIP_SUPER_DEBUG, "%s: called on sb %p (fs_id is %d)\n", __func__, sb, (int)(ORANGEFS_SB(sb)->fs_id)); new_op = op_alloc(ORANGEFS_VFS_OP_STATFS); if (!new_op) return ret; new_op->upcall.req.statfs.fs_id = ORANGEFS_SB(sb)->fs_id; if (ORANGEFS_SB(sb)->flags & ORANGEFS_OPT_INTR) flags = ORANGEFS_OP_INTERRUPTIBLE; ret = service_operation(new_op, "orangefs_statfs", flags); if (new_op->downcall.status < 0) goto out_op_release; gossip_debug(GOSSIP_SUPER_DEBUG, "%s: got %ld blocks available | " "%ld blocks total | %ld block size | " "%ld files total | %ld files avail\n", __func__, (long)new_op->downcall.resp.statfs.blocks_avail, (long)new_op->downcall.resp.statfs.blocks_total, (long)new_op->downcall.resp.statfs.block_size, (long)new_op->downcall.resp.statfs.files_total, (long)new_op->downcall.resp.statfs.files_avail); buf->f_type = sb->s_magic; buf->f_fsid.val[0] = ORANGEFS_SB(sb)->fs_id; buf->f_fsid.val[1] = ORANGEFS_SB(sb)->id; buf->f_bsize = new_op->downcall.resp.statfs.block_size; buf->f_namelen = ORANGEFS_NAME_MAX; buf->f_blocks = (sector_t) new_op->downcall.resp.statfs.blocks_total; buf->f_bfree = (sector_t) new_op->downcall.resp.statfs.blocks_avail; buf->f_bavail = (sector_t) new_op->downcall.resp.statfs.blocks_avail; buf->f_files = (sector_t) new_op->downcall.resp.statfs.files_total; buf->f_ffree = (sector_t) new_op->downcall.resp.statfs.files_avail; buf->f_frsize = 0; out_op_release: op_release(new_op); gossip_debug(GOSSIP_SUPER_DEBUG, "%s: returning %d\n", __func__, ret); return ret; } /* * Remount as initiated by VFS layer. We just need to reparse the mount * options, no need to signal pvfs2-client-core about it. */ static int orangefs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb); struct orangefs_sb_info_s *revised = fc->s_fs_info; unsigned int flags; flags = orangefs_sb->flags; flags &= ~(ORANGEFS_OPT_INTR | ORANGEFS_OPT_LOCAL_LOCK); flags |= revised->flags; WRITE_ONCE(orangefs_sb->flags, flags); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_reconfigure: called\n"); return 0; } /* * Remount as initiated by pvfs2-client-core on restart. This is used to * repopulate mount information left from previous pvfs2-client-core. * * the idea here is that given a valid superblock, we're * re-initializing the user space client with the initial mount * information specified when the super block was first initialized. * this is very different than the first initialization/creation of a * superblock. we use the special service_priority_operation to make * sure that the mount gets ahead of any other pending operation that * is waiting for servicing. this means that the pvfs2-client won't * fail to start several times for all other pending operations before * the client regains all of the mount information from us. * NOTE: this function assumes that the request_mutex is already acquired! */ int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb) { struct orangefs_kernel_op_s *new_op; int ret = -EINVAL; gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount: called\n"); new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT); if (!new_op) return -ENOMEM; strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, orangefs_sb->devname); gossip_debug(GOSSIP_SUPER_DEBUG, "Attempting ORANGEFS Remount via host %s\n", new_op->upcall.req.fs_mount.orangefs_config_server); /* * we assume that the calling function has already acquired the * request_mutex to prevent other operations from bypassing * this one */ ret = service_operation(new_op, "orangefs_remount", ORANGEFS_OP_PRIORITY | ORANGEFS_OP_NO_MUTEX); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount: mount got return value of %d\n", ret); if (ret == 0) { /* * store the id assigned to this sb -- it's just a * short-lived mapping that the system interface uses * to map this superblock to a particular mount entry */ orangefs_sb->id = new_op->downcall.resp.fs_mount.id; orangefs_sb->mount_pending = 0; } op_release(new_op); if (orangefs_userspace_version >= 20906) { new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); if (!new_op) return -ENOMEM; new_op->upcall.req.features.features = 0; ret = service_operation(new_op, "orangefs_features", ORANGEFS_OP_PRIORITY | ORANGEFS_OP_NO_MUTEX); if (!ret) orangefs_features = new_op->downcall.resp.features.features; else orangefs_features = 0; op_release(new_op); } else { orangefs_features = 0; } return ret; } int fsid_key_table_initialize(void) { return 0; } void fsid_key_table_finalize(void) { } static const struct super_operations orangefs_s_ops = { .alloc_inode = orangefs_alloc_inode, .free_inode = orangefs_free_inode, .destroy_inode = orangefs_destroy_inode, .write_inode = orangefs_write_inode, .drop_inode = inode_just_drop, .statfs = orangefs_statfs, .show_options = orangefs_show_options, }; static struct dentry *orangefs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct orangefs_object_kref refn; if (fh_len < 5 || fh_type > 2) return NULL; ORANGEFS_khandle_from(&(refn.khandle), fid->raw, 16); refn.fs_id = (u32) fid->raw[4]; gossip_debug(GOSSIP_SUPER_DEBUG, "fh_to_dentry: handle %pU, fs_id %d\n", &refn.khandle, refn.fs_id); return d_obtain_alias(orangefs_iget(sb, &refn)); } static int orangefs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { int len = parent ? 10 : 5; int type = 1; struct orangefs_object_kref refn; if (*max_len < len) { gossip_err("fh buffer is too small for encoding\n"); *max_len = len; type = 255; goto out; } refn = ORANGEFS_I(inode)->refn; ORANGEFS_khandle_to(&refn.khandle, fh, 16); fh[4] = refn.fs_id; gossip_debug(GOSSIP_SUPER_DEBUG, "Encoding fh: handle %pU, fsid %u\n", &refn.khandle, refn.fs_id); if (parent) { refn = ORANGEFS_I(parent)->refn; ORANGEFS_khandle_to(&refn.khandle, (char *) fh + 20, 16); fh[9] = refn.fs_id; type = 2; gossip_debug(GOSSIP_SUPER_DEBUG, "Encoding parent: handle %pU, fsid %u\n", &refn.khandle, refn.fs_id); } *max_len = len; out: return type; } static const struct export_operations orangefs_export_ops = { .encode_fh = orangefs_encode_fh, .fh_to_dentry = orangefs_fh_to_dentry, }; static int orangefs_unmount(int id, __s32 fs_id, const char *devname) { struct orangefs_kernel_op_s *op; int r; op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT); if (!op) return -ENOMEM; op->upcall.req.fs_umount.id = id; op->upcall.req.fs_umount.fs_id = fs_id; strscpy(op->upcall.req.fs_umount.orangefs_config_server, devname); r = service_operation(op, "orangefs_fs_umount", 0); /* Not much to do about an error here. */ if (r) gossip_err("orangefs_unmount: service_operation %d\n", r); op_release(op); return r; } static int orangefs_fill_sb(struct super_block *sb, struct fs_context *fc, struct orangefs_fs_mount_response *fs_mount) { int ret; struct inode *root; struct dentry *root_dentry; struct orangefs_object_kref root_object; ORANGEFS_SB(sb)->sb = sb; ORANGEFS_SB(sb)->root_khandle = fs_mount->root_khandle; ORANGEFS_SB(sb)->fs_id = fs_mount->fs_id; ORANGEFS_SB(sb)->id = fs_mount->id; /* Hang the xattr handlers off the superblock */ sb->s_xattr = orangefs_xattr_handlers; sb->s_magic = ORANGEFS_SUPER_MAGIC; sb->s_op = &orangefs_s_ops; set_default_d_op(sb, &orangefs_dentry_operations); sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_maxbytes = MAX_LFS_FILESIZE; ret = super_setup_bdi(sb); if (ret) return ret; root_object.khandle = ORANGEFS_SB(sb)->root_khandle; root_object.fs_id = ORANGEFS_SB(sb)->fs_id; gossip_debug(GOSSIP_SUPER_DEBUG, "get inode %pU, fsid %d\n", &root_object.khandle, root_object.fs_id); root = orangefs_iget(sb, &root_object); if (IS_ERR(root)) return PTR_ERR(root); gossip_debug(GOSSIP_SUPER_DEBUG, "Allocated root inode [%p] with mode %x\n", root, root->i_mode); /* allocates and places root dentry in dcache */ root_dentry = d_make_root(root); if (!root_dentry) return -ENOMEM; sb->s_export_op = &orangefs_export_ops; sb->s_root = root_dentry; return 0; } static int orangefs_get_tree(struct fs_context *fc) { int ret; struct super_block *sb = ERR_PTR(-EINVAL); struct orangefs_kernel_op_s *new_op; if (!fc->source) return invalf(fc, "Device name not specified.\n"); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_mount: called with devname %s\n", fc->source); new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT); if (!new_op) return -ENOMEM; strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, fc->source); gossip_debug(GOSSIP_SUPER_DEBUG, "Attempting ORANGEFS Mount via host %s\n", new_op->upcall.req.fs_mount.orangefs_config_server); ret = service_operation(new_op, "orangefs_mount", 0); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_mount: mount got return value of %d\n", ret); if (ret) goto free_op; if (new_op->downcall.resp.fs_mount.fs_id == ORANGEFS_FS_ID_NULL) { gossip_err("ERROR: Retrieved null fs_id\n"); ret = -EINVAL; goto free_op; } sb = sget_fc(fc, NULL, set_anon_super_fc); if (IS_ERR(sb)) { ret = PTR_ERR(sb); orangefs_unmount(new_op->downcall.resp.fs_mount.id, new_op->downcall.resp.fs_mount.fs_id, fc->source); goto free_op; } /* init our private orangefs sb info */ ret = orangefs_fill_sb(sb, fc, &new_op->downcall.resp.fs_mount); if (ret) goto free_sb_and_op; /* * on successful mount, store the devname and data * used */ strscpy(ORANGEFS_SB(sb)->devname, fc->source); /* mount_pending must be cleared */ ORANGEFS_SB(sb)->mount_pending = 0; /* * finally, add this sb to our list of known orangefs * sb's */ gossip_debug(GOSSIP_SUPER_DEBUG, "Adding SB %p to orangefs superblocks\n", ORANGEFS_SB(sb)); spin_lock(&orangefs_superblocks_lock); list_add_tail(&ORANGEFS_SB(sb)->list, &orangefs_superblocks); spin_unlock(&orangefs_superblocks_lock); op_release(new_op); /* Must be removed from the list now. */ ORANGEFS_SB(sb)->no_list = 0; if (orangefs_userspace_version >= 20906) { new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); if (!new_op) return -ENOMEM; new_op->upcall.req.features.features = 0; ret = service_operation(new_op, "orangefs_features", 0); orangefs_features = new_op->downcall.resp.features.features; op_release(new_op); } else { orangefs_features = 0; } fc->root = dget(sb->s_root); return 0; free_sb_and_op: /* Will call orangefs_kill_sb with sb not in list. */ ORANGEFS_SB(sb)->no_list = 1; /* ORANGEFS_VFS_OP_FS_UMOUNT is done by orangefs_kill_sb. */ deactivate_locked_super(sb); free_op: gossip_err("orangefs_mount: mount request failed with %d\n", ret); if (ret == -EINVAL) { gossip_err("Ensure that all orangefs-servers have the same FS configuration files\n"); gossip_err("Look at pvfs2-client-core log file (typically /tmp/pvfs2-client.log) for more details\n"); } op_release(new_op); return ret; } static void orangefs_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations orangefs_context_ops = { .free = orangefs_free_fc, .parse_param = orangefs_parse_param, .get_tree = orangefs_get_tree, .reconfigure = orangefs_reconfigure, }; /* * Set up the filesystem mount context. */ int orangefs_init_fs_context(struct fs_context *fc) { struct orangefs_sb_info_s *osi; osi = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL); if (!osi) return -ENOMEM; /* * Force any potential flags that might be set from the mount * to zero, ie, initialize to unset. */ fc->sb_flags_mask &= ~SB_POSIXACL; osi->flags &= ~ORANGEFS_OPT_INTR; osi->flags &= ~ORANGEFS_OPT_LOCAL_LOCK; fc->s_fs_info = osi; fc->ops = &orangefs_context_ops; return 0; } void orangefs_kill_sb(struct super_block *sb) { int r; gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n"); /* provided sb cleanup */ kill_anon_super(sb); if (!ORANGEFS_SB(sb)) { mutex_lock(&orangefs_request_mutex); mutex_unlock(&orangefs_request_mutex); return; } /* * issue the unmount to userspace to tell it to remove the * dynamic mount info it has for this superblock */ r = orangefs_unmount(ORANGEFS_SB(sb)->id, ORANGEFS_SB(sb)->fs_id, ORANGEFS_SB(sb)->devname); if (!r) ORANGEFS_SB(sb)->mount_pending = 1; if (!ORANGEFS_SB(sb)->no_list) { /* remove the sb from our list of orangefs specific sb's */ spin_lock(&orangefs_superblocks_lock); /* not list_del_init */ __list_del_entry(&ORANGEFS_SB(sb)->list); ORANGEFS_SB(sb)->list.prev = NULL; spin_unlock(&orangefs_superblocks_lock); } /* * make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us * gets completed before we free the dang thing. */ mutex_lock(&orangefs_request_mutex); mutex_unlock(&orangefs_request_mutex); /* free the orangefs superblock private data */ kfree(ORANGEFS_SB(sb)); } int orangefs_inode_cache_initialize(void) { orangefs_inode_cache = kmem_cache_create_usercopy( "orangefs_inode_cache", sizeof(struct orangefs_inode_s), 0, 0, offsetof(struct orangefs_inode_s, link_target), sizeof_field(struct orangefs_inode_s, link_target), orangefs_inode_cache_ctor); if (!orangefs_inode_cache) { gossip_err("Cannot create orangefs_inode_cache\n"); return -ENOMEM; } return 0; } int orangefs_inode_cache_finalize(void) { kmem_cache_destroy(orangefs_inode_cache); return 0; }
5 5 4 3 2 2 3 3 3 3 3 1 2 2 2 1 1 1 1 1 2 2 2 2 2 2 2 1 1 1 1 1 4 1 1 4 1 4 1 5 5 5 5 7 1 2 5 5 5 5 1 5 1 4 1 4 4 1 5 5 10 1 1 9 1 9 3 3 3 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 // SPDX-License-Identifier: LGPL-2.1 /* * Copyright IBM Corporation, 2007 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * */ #include <linux/slab.h> #include "ext4_jbd2.h" #include "ext4_extents.h" /* * The contiguous blocks details which can be * represented by a single extent */ struct migrate_struct { ext4_lblk_t first_block, last_block, curr_block; ext4_fsblk_t first_pblock, last_pblock; }; static int finish_range(handle_t *handle, struct inode *inode, struct migrate_struct *lb) { int retval = 0, needed; struct ext4_extent newext; struct ext4_ext_path *path; if (lb->first_pblock == 0) return 0; /* Add the extent to temp inode*/ newext.ee_block = cpu_to_le32(lb->first_block); newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); ext4_ext_store_pblock(&newext, lb->first_pblock); /* Locking only for convenience since we are operating on temp inode */ down_write(&EXT4_I(inode)->i_data_sem); path = ext4_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { retval = PTR_ERR(path); goto err_out; } /* * Calculate the credit needed to inserting this extent * Since we are doing this in loop we may accumulate extra * credit. But below we try to not accumulate too much * of them by restarting the journal. */ needed = ext4_ext_calc_credits_for_single_extent(inode, lb->last_block - lb->first_block + 1, path); retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0); if (retval < 0) goto err_out; path = ext4_ext_insert_extent(handle, inode, path, &newext, 0); if (IS_ERR(path)) retval = PTR_ERR(path); err_out: up_write((&EXT4_I(inode)->i_data_sem)); ext4_free_ext_path(path); lb->first_pblock = 0; return retval; } static int update_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { int retval; /* * See if we can add on to the existing range (if it exists) */ if (lb->first_pblock && (lb->last_pblock+1 == pblock) && (lb->last_block+1 == lb->curr_block)) { lb->last_pblock = pblock; lb->last_block = lb->curr_block; lb->curr_block++; return 0; } /* * Start a new range. */ retval = finish_range(handle, inode, lb); lb->first_pblock = lb->last_pblock = pblock; lb->first_block = lb->last_block = lb->curr_block; lb->curr_block++; return retval; } static int update_ind_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, pblock, 0); if (IS_ERR(bh)) return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_extent_range(handle, inode, le32_to_cpu(i_data[i]), lb); if (retval) break; } else { lb->curr_block++; } } put_bh(bh); return retval; } static int update_dind_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, pblock, 0); if (IS_ERR(bh)) return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_ind_extent_range(handle, inode, le32_to_cpu(i_data[i]), lb); if (retval) break; } else { /* Only update the file block number */ lb->curr_block += max_entries; } } put_bh(bh); return retval; } static int update_tind_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, pblock, 0); if (IS_ERR(bh)) return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_dind_extent_range(handle, inode, le32_to_cpu(i_data[i]), lb); if (retval) break; } else { /* Only update the file block number */ lb->curr_block += max_entries * max_entries; } } put_bh(bh); return retval; } static int free_dind_blocks(handle_t *handle, struct inode *inode, __le32 i_data) { int i; __le32 *tmp_idata; struct buffer_head *bh; struct super_block *sb = inode->i_sb; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; int err; bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0); if (IS_ERR(bh)) return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(sb, 1)); if (err < 0) { put_bh(bh); return err; } ext4_free_blocks(handle, inode, NULL, le32_to_cpu(tmp_idata[i]), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } } put_bh(bh); err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(sb, 1)); if (err < 0) return err; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; } static int free_tind_blocks(handle_t *handle, struct inode *inode, __le32 i_data) { int i, retval = 0; __le32 *tmp_idata; struct buffer_head *bh; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); if (IS_ERR(bh)) return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { retval = free_dind_blocks(handle, inode, tmp_idata[i]); if (retval) { put_bh(bh); return retval; } } } put_bh(bh); retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (retval < 0) return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; } static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) { int retval; /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (retval < 0) return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data[0]), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } /* ei->i_data[EXT4_DIND_BLOCK] */ if (i_data[1]) { retval = free_dind_blocks(handle, inode, i_data[1]); if (retval) return retval; } /* ei->i_data[EXT4_TIND_BLOCK] */ if (i_data[2]) { retval = free_tind_blocks(handle, inode, i_data[2]); if (retval) return retval; } return 0; } static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, struct inode *tmp_inode) { int retval, retval2 = 0; __le32 i_data[3]; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); /* * One credit accounted for writing the * i_data field of the original inode */ retval = ext4_journal_ensure_credits(handle, 1, 0); if (retval < 0) goto err_out; i_data[0] = ei->i_data[EXT4_IND_BLOCK]; i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; i_data[2] = ei->i_data[EXT4_TIND_BLOCK]; down_write(&EXT4_I(inode)->i_data_sem); /* * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation * happened after we started the migrate. We need to * fail the migrate */ if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) { retval = -EAGAIN; up_write(&EXT4_I(inode)->i_data_sem); goto err_out; } else ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); /* * We have the extent map build with the tmp inode. * Now copy the i_data across */ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); /* * Update i_blocks with the new blocks that got * allocated while adding extents for extent index * blocks. * * While converting to extents we need not * update the original inode i_blocks for extent blocks * via quota APIs. The quota update happened via tmp_inode already. */ spin_lock(&inode->i_lock); inode->i_blocks += tmp_inode->i_blocks; spin_unlock(&inode->i_lock); up_write(&EXT4_I(inode)->i_data_sem); /* * We mark the inode dirty after, because we decrement the * i_blocks when freeing the indirect meta-data blocks */ retval = free_ind_block(handle, inode, i_data); retval2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(retval2 && !retval)) retval = retval2; err_out: return retval; } static int free_ext_idx(handle_t *handle, struct inode *inode, struct ext4_extent_idx *ix) { int i, retval = 0; ext4_fsblk_t block; struct buffer_head *bh; struct ext4_extent_header *eh; block = ext4_idx_pblock(ix); bh = ext4_sb_bread(inode->i_sb, block, 0); if (IS_ERR(bh)) return PTR_ERR(bh); eh = (struct ext4_extent_header *)bh->b_data; if (eh->eh_depth != 0) { ix = EXT_FIRST_INDEX(eh); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { retval = free_ext_idx(handle, inode, ix); if (retval) { put_bh(bh); return retval; } } } put_bh(bh); retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (retval < 0) return retval; ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; } /* * Free the extent meta data blocks only */ static int free_ext_block(handle_t *handle, struct inode *inode) { int i, retval = 0; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data; struct ext4_extent_idx *ix; if (eh->eh_depth == 0) /* * No extra blocks allocated for extent meta data */ return 0; ix = EXT_FIRST_INDEX(eh); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { retval = free_ext_idx(handle, inode, ix); if (retval) return retval; } return retval; } int ext4_ext_migrate(struct inode *inode) { handle_t *handle; int retval = 0, i; __le32 *i_data; struct ext4_inode_info *ei; struct inode *tmp_inode = NULL; struct migrate_struct lb; unsigned long max_entries; __u32 goal, tmp_csum_seed; uid_t owner[2]; int alloc_ctx; /* * If the filesystem does not support extents, or the inode * already is extent-based, error out. */ if (!ext4_has_feature_extents(inode->i_sb) || ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || ext4_has_inline_data(inode)) return -EINVAL; if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) /* * don't migrate fast symlink */ return retval; alloc_ctx = ext4_writepages_down_write(inode->i_sb); /* * Worst case we can touch the allocation bitmaps and a block * group descriptor block. We do need to worry about * credits for modifying the quota inode. */ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto out_unlock; } goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; owner[0] = i_uid_read(inode); owner[1] = i_gid_read(inode); tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root), S_IFREG, NULL, goal, owner, 0); if (IS_ERR(tmp_inode)) { retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); goto out_unlock; } /* * Use the correct seed for checksum (i.e. the seed from 'inode'). This * is so that the metadata blocks will have the correct checksum after * the migration. */ ei = EXT4_I(inode); tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed; EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed; i_size_write(tmp_inode, i_size_read(inode)); /* * Set the i_nlink to zero so it will be deleted later * when we drop inode reference. */ clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); ext4_journal_stop(handle); /* * start with one credit accounted for * superblock modification. * * For the tmp_inode we already have committed the * transaction that created the inode. Later as and * when we add extents we extent the journal */ /* * Even though we take i_rwsem we can still cause block * allocation via mmap write to holes. If we have allocated * new blocks we fail migrate. New block allocation will * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated * with i_data_sem held to prevent racing with block * allocation. */ down_read(&EXT4_I(inode)->i_data_sem); ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); up_read((&EXT4_I(inode)->i_data_sem)); handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto out_tmp_inode; } i_data = ei->i_data; memset(&lb, 0, sizeof(lb)); /* 32 bit block address 4 bytes */ max_entries = inode->i_sb->s_blocksize >> 2; for (i = 0; i < EXT4_NDIR_BLOCKS; i++) { if (i_data[i]) { retval = update_extent_range(handle, tmp_inode, le32_to_cpu(i_data[i]), &lb); if (retval) goto err_out; } else lb.curr_block++; } if (i_data[EXT4_IND_BLOCK]) { retval = update_ind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb); if (retval) goto err_out; } else lb.curr_block += max_entries; if (i_data[EXT4_DIND_BLOCK]) { retval = update_dind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb); if (retval) goto err_out; } else lb.curr_block += max_entries * max_entries; if (i_data[EXT4_TIND_BLOCK]) { retval = update_tind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb); if (retval) goto err_out; } /* * Build the last extent */ retval = finish_range(handle, tmp_inode, &lb); err_out: if (retval) /* * Failure case delete the extent information with the * tmp_inode */ free_ext_block(handle, tmp_inode); else { retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode); if (retval) /* * if we fail to swap inode data free the extent * details of the tmp inode */ free_ext_block(handle, tmp_inode); } /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ retval = ext4_journal_ensure_credits(handle, 1, 0); if (retval < 0) goto out_stop; /* * Mark the tmp_inode as of size zero */ i_size_write(tmp_inode, 0); /* * set the i_blocks count to zero * so that the ext4_evict_inode() does the * right job * * We don't need to take the i_lock because * the inode is not visible to user space. */ tmp_inode->i_blocks = 0; EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed; /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); out_stop: ext4_journal_stop(handle); out_tmp_inode: unlock_new_inode(tmp_inode); iput(tmp_inode); out_unlock: ext4_writepages_up_write(inode->i_sb, alloc_ctx); return retval; } /* * Migrate a simple extent-based inode to use the i_blocks[] array */ int ext4_ind_migrate(struct inode *inode) { struct ext4_extent_header *eh; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_super_block *es = sbi->s_es; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_extent *ex; unsigned int i, len; ext4_lblk_t start, end; ext4_fsblk_t blk; handle_t *handle; int ret, ret2 = 0; int alloc_ctx; if (!ext4_has_feature_extents(inode->i_sb) || (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EINVAL; if (ext4_has_feature_bigalloc(inode->i_sb)) return -EOPNOTSUPP; /* * In order to get correct extent info, force all delayed allocation * blocks to be allocated, otherwise delayed allocation blocks may not * be reflected and bypass the checks on extent header. */ if (test_opt(inode->i_sb, DELALLOC)) ext4_alloc_da_blocks(inode); alloc_ctx = ext4_writepages_down_write(inode->i_sb); handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_unlock; } down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_check_inode(inode); if (ret) goto errout; eh = ext_inode_hdr(inode); ex = EXT_FIRST_EXTENT(eh); if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { ret = -EOPNOTSUPP; goto errout; } if (eh->eh_entries == 0) blk = len = start = end = 0; else { len = le16_to_cpu(ex->ee_len); blk = ext4_ext_pblock(ex); start = le32_to_cpu(ex->ee_block); end = start + len - 1; if (end >= EXT4_NDIR_BLOCKS) { ret = -EOPNOTSUPP; goto errout; } } ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); memset(ei->i_data, 0, sizeof(ei->i_data)); for (i = start; i <= end; i++) ei->i_data[i] = cpu_to_le32(blk++); ret2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret2 && !ret)) ret = ret2; errout: up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); out_unlock: ext4_writepages_up_write(inode->i_sb, alloc_ctx); return ret; }
5 22 2456 1 74 3689 248 309 56 18 42 39 3246 12 4366 3205 4314 30 21 2 3205 94 106 4 4 1 94 42 12310 10 12 1125 3 1305 75 939 10 731 78 1085 15751 8544 15637 89 480 89 480 268 8 20 1110 1110 430 7 1 1 8857 8313 4 814 773 46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H #include <linux/mem_encrypt.h> #include <asm/page.h> #include <asm/pgtable_types.h> /* * Macro to mark a page protection value as UC- */ #define pgprot_noncached(prot) \ ((boot_cpu_data.x86 > 3) \ ? (__pgprot(pgprot_val(prot) | \ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) #ifndef __ASSEMBLER__ #include <linux/spinlock.h> #include <asm/x86_init.h> #include <asm/pkru.h> #include <asm/fpu/api.h> #include <asm/coco.h> #include <asm-generic/pgtable_uffd.h> #include <linux/page_table_check.h> extern pgd_t early_top_pgt[PTRS_PER_PGD]; bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); struct seq_file; void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user); bool ptdump_walk_pgd_level_checkwx(void); #define ptdump_check_wx ptdump_walk_pgd_level_checkwx void ptdump_walk_user_pgd_level_checkwx(void); /* * Macros to add or remove encryption attribute */ #define pgprot_encrypted(prot) __pgprot(cc_mkenc(pgprot_val(prot))) #define pgprot_decrypted(prot) __pgprot(cc_mkdec(pgprot_val(prot))) #ifdef CONFIG_DEBUG_WX #define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx() #else #define debug_checkwx_user() do { } while (0) #endif /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __visible; #define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page)) extern spinlock_t pgd_lock; extern struct list_head pgd_list; extern struct mm_struct *pgd_page_get_mm(struct page *page); extern pmdval_t early_pmd_flags; #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else /* !CONFIG_PARAVIRT_XXL */ #define set_pte(ptep, pte) native_set_pte(ptep, pte) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) #define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd) #ifndef __PAGETABLE_P4D_FOLDED #define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) #define pgd_clear(pgd) (pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0) #endif #ifndef set_p4d # define set_p4d(p4dp, p4d) native_set_p4d(p4dp, p4d) #endif #ifndef __PAGETABLE_PUD_FOLDED #define p4d_clear(p4d) native_p4d_clear(p4d) #endif #ifndef set_pud # define set_pud(pudp, pud) native_set_pud(pudp, pud) #endif #ifndef __PAGETABLE_PUD_FOLDED #define pud_clear(pud) native_pud_clear(pud) #endif #define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep) #define pmd_clear(pmd) native_pmd_clear(pmd) #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) #ifndef __PAGETABLE_P4D_FOLDED #define p4d_val(x) native_p4d_val(x) #define __p4d(x) native_make_p4d(x) #endif #ifndef __PAGETABLE_PUD_FOLDED #define pud_val(x) native_pud_val(x) #define __pud(x) native_make_pud(x) #endif #ifndef __PAGETABLE_PMD_FOLDED #define pmd_val(x) native_pmd_val(x) #define __pmd(x) native_make_pmd(x) #endif #define pte_val(x) native_pte_val(x) #define __pte(x) native_make_pte(x) #define arch_end_context_switch(prev) do {} while(0) #endif /* CONFIG_PARAVIRT_XXL */ static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) { pmdval_t v = native_pmd_val(pmd); return native_make_pmd(v | set); } static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) { pmdval_t v = native_pmd_val(pmd); return native_make_pmd(v & ~clear); } static inline pud_t pud_set_flags(pud_t pud, pudval_t set) { pudval_t v = native_pud_val(pud); return native_make_pud(v | set); } static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) { pudval_t v = native_pud_val(pud); return native_make_pud(v & ~clear); } /* * The following only work if pte_present() is true. * Undefined behaviour if not.. */ static inline bool pte_dirty(pte_t pte) { return pte_flags(pte) & _PAGE_DIRTY_BITS; } static inline bool pte_shstk(pte_t pte) { return cpu_feature_enabled(X86_FEATURE_SHSTK) && (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_flags(pte) & _PAGE_ACCESSED; } static inline bool pte_decrypted(pte_t pte) { return cc_mkdec(pte_val(pte)) == pte_val(pte); } #define pmd_dirty pmd_dirty static inline bool pmd_dirty(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_DIRTY_BITS; } static inline bool pmd_shstk(pmd_t pmd) { return cpu_feature_enabled(X86_FEATURE_SHSTK) && (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) == (_PAGE_DIRTY | _PAGE_PSE); } #define pmd_young pmd_young static inline int pmd_young(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_ACCESSED; } static inline bool pud_dirty(pud_t pud) { return pud_flags(pud) & _PAGE_DIRTY_BITS; } static inline int pud_young(pud_t pud) { return pud_flags(pud) & _PAGE_ACCESSED; } static inline bool pud_shstk(pud_t pud) { return cpu_feature_enabled(X86_FEATURE_SHSTK) && (pud_flags(pud) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) == (_PAGE_DIRTY | _PAGE_PSE); } static inline int pte_write(pte_t pte) { /* * Shadow stack pages are logically writable, but do not have * _PAGE_RW. Check for them separately from _PAGE_RW itself. */ return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte); } #define pmd_write pmd_write static inline int pmd_write(pmd_t pmd) { /* * Shadow stack pages are logically writable, but do not have * _PAGE_RW. Check for them separately from _PAGE_RW itself. */ return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd); } #define pud_write pud_write static inline int pud_write(pud_t pud) { return pud_flags(pud) & _PAGE_RW; } static inline int pte_huge(pte_t pte) { return pte_flags(pte) & _PAGE_PSE; } static inline int pte_global(pte_t pte) { return pte_flags(pte) & _PAGE_GLOBAL; } static inline int pte_exec(pte_t pte) { return !(pte_flags(pte) & _PAGE_NX); } static inline int pte_special(pte_t pte) { return pte_flags(pte) & _PAGE_SPECIAL; } /* Entries that were set to PROT_NONE are inverted */ static inline u64 protnone_mask(u64 val); #define PFN_PTE_SHIFT PAGE_SHIFT static inline unsigned long pte_pfn(pte_t pte) { phys_addr_t pfn = pte_val(pte); pfn ^= protnone_mask(pfn); return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; } static inline unsigned long pmd_pfn(pmd_t pmd) { phys_addr_t pfn = pmd_val(pmd); pfn ^= protnone_mask(pfn); return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } #define pud_pfn pud_pfn static inline unsigned long pud_pfn(pud_t pud) { phys_addr_t pfn = pud_val(pud); pfn ^= protnone_mask(pfn); return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT; } static inline unsigned long p4d_pfn(p4d_t p4d) { return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; } static inline unsigned long pgd_pfn(pgd_t pgd) { return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; } #define pte_page(pte) pfn_to_page(pte_pfn(pte)) #define pmd_leaf pmd_leaf static inline bool pmd_leaf(pmd_t pte) { return pmd_flags(pte) & _PAGE_PSE; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { return (pmd_val(pmd) & _PAGE_PSE) == _PAGE_PSE; } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static inline int pud_trans_huge(pud_t pud) { return (pud_val(pud) & _PAGE_PSE) == _PAGE_PSE; } #endif #define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { return boot_cpu_has(X86_FEATURE_PSE); } #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP static inline bool pmd_special(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_SPECIAL; } static inline pmd_t pmd_mkspecial(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_SPECIAL); } #endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP static inline bool pud_special(pud_t pud) { return pud_flags(pud) & _PAGE_SPECIAL; } static inline pud_t pud_mkspecial(pud_t pud) { return pud_set_flags(pud, _PAGE_SPECIAL); } #endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline pte_t pte_set_flags(pte_t pte, pteval_t set) { pteval_t v = native_pte_val(pte); return native_make_pte(v | set); } static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) { pteval_t v = native_pte_val(pte); return native_make_pte(v & ~clear); } /* * Write protection operations can result in Dirty=1,Write=0 PTEs. But in the * case of X86_FEATURE_USER_SHSTK, these PTEs denote shadow stack memory. So * when creating dirty, write-protected memory, a software bit is used: * _PAGE_BIT_SAVED_DIRTY. The following functions take a PTE and transition the * Dirty bit to SavedDirty, and vice-vesra. * * This shifting is only done if needed. In the case of shifting * Dirty->SavedDirty, the condition is if the PTE is Write=0. In the case of * shifting SavedDirty->Dirty, the condition is Write=1. */ static inline pgprotval_t mksaveddirty_shift(pgprotval_t v) { pgprotval_t cond = (~v >> _PAGE_BIT_RW) & 1; v |= ((v >> _PAGE_BIT_DIRTY) & cond) << _PAGE_BIT_SAVED_DIRTY; v &= ~(cond << _PAGE_BIT_DIRTY); return v; } static inline pgprotval_t clear_saveddirty_shift(pgprotval_t v) { pgprotval_t cond = (v >> _PAGE_BIT_RW) & 1; v |= ((v >> _PAGE_BIT_SAVED_DIRTY) & cond) << _PAGE_BIT_DIRTY; v &= ~(cond << _PAGE_BIT_SAVED_DIRTY); return v; } static inline pte_t pte_mksaveddirty(pte_t pte) { pteval_t v = native_pte_val(pte); v = mksaveddirty_shift(v); return native_make_pte(v); } static inline pte_t pte_clear_saveddirty(pte_t pte) { pteval_t v = native_pte_val(pte); v = clear_saveddirty_shift(v); return native_make_pte(v); } static inline pte_t pte_wrprotect(pte_t pte) { pte = pte_clear_flags(pte, _PAGE_RW); /* * Blindly clearing _PAGE_RW might accidentally create * a shadow stack PTE (Write=0,Dirty=1). Move the hardware * dirty value to the software bit, if present. */ return pte_mksaveddirty(pte); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { return pte_flags(pte) & _PAGE_UFFD_WP; } static inline pte_t pte_mkuffd_wp(pte_t pte) { return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP)); } static inline pte_t pte_clear_uffd_wp(pte_t pte) { return pte_clear_flags(pte, _PAGE_UFFD_WP); } #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ static inline pte_t pte_mkclean(pte_t pte) { return pte_clear_flags(pte, _PAGE_DIRTY_BITS); } static inline pte_t pte_mkold(pte_t pte) { return pte_clear_flags(pte, _PAGE_ACCESSED); } static inline pte_t pte_mkexec(pte_t pte) { return pte_clear_flags(pte, _PAGE_NX); } static inline pte_t pte_mkdirty(pte_t pte) { pte = pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); return pte_mksaveddirty(pte); } static inline pte_t pte_mkwrite_shstk(pte_t pte) { pte = pte_clear_flags(pte, _PAGE_RW); return pte_set_flags(pte, _PAGE_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) { return pte_set_flags(pte, _PAGE_ACCESSED); } static inline pte_t pte_mkwrite_novma(pte_t pte) { return pte_set_flags(pte, _PAGE_RW); } struct vm_area_struct; pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma); #define pte_mkwrite pte_mkwrite static inline pte_t pte_mkhuge(pte_t pte) { return pte_set_flags(pte, _PAGE_PSE); } static inline pte_t pte_clrhuge(pte_t pte) { return pte_clear_flags(pte, _PAGE_PSE); } static inline pte_t pte_mkglobal(pte_t pte) { return pte_set_flags(pte, _PAGE_GLOBAL); } static inline pte_t pte_clrglobal(pte_t pte) { return pte_clear_flags(pte, _PAGE_GLOBAL); } static inline pte_t pte_mkspecial(pte_t pte) { return pte_set_flags(pte, _PAGE_SPECIAL); } /* See comments above mksaveddirty_shift() */ static inline pmd_t pmd_mksaveddirty(pmd_t pmd) { pmdval_t v = native_pmd_val(pmd); v = mksaveddirty_shift(v); return native_make_pmd(v); } /* See comments above mksaveddirty_shift() */ static inline pmd_t pmd_clear_saveddirty(pmd_t pmd) { pmdval_t v = native_pmd_val(pmd); v = clear_saveddirty_shift(v); return native_make_pmd(v); } static inline pmd_t pmd_wrprotect(pmd_t pmd) { pmd = pmd_clear_flags(pmd, _PAGE_RW); /* * Blindly clearing _PAGE_RW might accidentally create * a shadow stack PMD (RW=0, Dirty=1). Move the hardware * dirty value to the software bit. */ return pmd_mksaveddirty(pmd); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pmd_uffd_wp(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_UFFD_WP; } static inline pmd_t pmd_mkuffd_wp(pmd_t pmd) { return pmd_wrprotect(pmd_set_flags(pmd, _PAGE_UFFD_WP)); } static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_UFFD_WP); } #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ static inline pmd_t pmd_mkold(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_ACCESSED); } static inline pmd_t pmd_mkclean(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS); } static inline pmd_t pmd_mkdirty(pmd_t pmd) { pmd = pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); return pmd_mksaveddirty(pmd); } static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd) { pmd = pmd_clear_flags(pmd, _PAGE_RW); return pmd_set_flags(pmd, _PAGE_DIRTY); } static inline pmd_t pmd_mkhuge(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_PSE); } static inline pmd_t pmd_mkyoung(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_ACCESSED); } static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_RW); } pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); #define pmd_mkwrite pmd_mkwrite /* See comments above mksaveddirty_shift() */ static inline pud_t pud_mksaveddirty(pud_t pud) { pudval_t v = native_pud_val(pud); v = mksaveddirty_shift(v); return native_make_pud(v); } /* See comments above mksaveddirty_shift() */ static inline pud_t pud_clear_saveddirty(pud_t pud) { pudval_t v = native_pud_val(pud); v = clear_saveddirty_shift(v); return native_make_pud(v); } static inline pud_t pud_mkold(pud_t pud) { return pud_clear_flags(pud, _PAGE_ACCESSED); } static inline pud_t pud_mkclean(pud_t pud) { return pud_clear_flags(pud, _PAGE_DIRTY_BITS); } static inline pud_t pud_wrprotect(pud_t pud) { pud = pud_clear_flags(pud, _PAGE_RW); /* * Blindly clearing _PAGE_RW might accidentally create * a shadow stack PUD (RW=0, Dirty=1). Move the hardware * dirty value to the software bit. */ return pud_mksaveddirty(pud); } static inline pud_t pud_mkdirty(pud_t pud) { pud = pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); return pud_mksaveddirty(pud); } static inline pud_t pud_mkhuge(pud_t pud) { return pud_set_flags(pud, _PAGE_PSE); } static inline pud_t pud_mkyoung(pud_t pud) { return pud_set_flags(pud, _PAGE_ACCESSED); } static inline pud_t pud_mkwrite(pud_t pud) { pud = pud_set_flags(pud, _PAGE_RW); return pud_clear_saveddirty(pud); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline int pte_soft_dirty(pte_t pte) { return pte_flags(pte) & _PAGE_SOFT_DIRTY; } static inline int pmd_soft_dirty(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_SOFT_DIRTY; } static inline int pud_soft_dirty(pud_t pud) { return pud_flags(pud) & _PAGE_SOFT_DIRTY; } static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte_set_flags(pte, _PAGE_SOFT_DIRTY); } static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } static inline pud_t pud_mksoft_dirty(pud_t pud) { return pud_set_flags(pud, _PAGE_SOFT_DIRTY); } static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); } static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); } static inline pud_t pud_clear_soft_dirty(pud_t pud) { return pud_clear_flags(pud, _PAGE_SOFT_DIRTY); } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. */ static inline pgprotval_t massage_pgprot(pgprot_t pgprot) { pgprotval_t protval = pgprot_val(pgprot); if (protval & _PAGE_PRESENT) protval &= __supported_pte_mask; return protval; } static inline pgprotval_t check_pgprot(pgprot_t pgprot) { pgprotval_t massaged_val = massage_pgprot(pgprot); /* mmdebug.h can not be included here because of dependencies */ #ifdef CONFIG_DEBUG_VM WARN_ONCE(pgprot_val(pgprot) != massaged_val, "attempted to set unsupported pgprot: %016llx " "bits: %016llx supported: %016llx\n", (u64)pgprot_val(pgprot), (u64)pgprot_val(pgprot) ^ massaged_val, (u64)__supported_pte_mask); #endif return massaged_val; } static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; /* This bit combination is used to mark shadow stacks */ WARN_ON_ONCE((pgprot_val(pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == _PAGE_DIRTY); pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PTE_PFN_MASK; return __pte(pfn | check_pgprot(pgprot)); } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PHYSICAL_PMD_PAGE_MASK; return __pmd(pfn | check_pgprot(pgprot)); } static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) { phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PHYSICAL_PUD_PAGE_MASK; return __pud(pfn | check_pgprot(pgprot)); } static inline pmd_t pmd_mkinvalid(pmd_t pmd) { return pfn_pmd(pmd_pfn(pmd), __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); } static inline pud_t pud_mkinvalid(pud_t pud) { return pfn_pud(pud_pfn(pud), __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); } static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pteval_t val = pte_val(pte), oldval = val; pte_t pte_result; /* * Chop off the NX bit (if present), and add the NX portion of * the newprot (if present): */ val &= _PAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PTE_PFN_MASK); pte_result = __pte(val); /* * To avoid creating Write=0,Dirty=1 PTEs, pte_modify() needs to avoid: * 1. Marking Write=0 PTEs Dirty=1 * 2. Marking Dirty=1 PTEs Write=0 * * The first case cannot happen because the _PAGE_CHG_MASK will filter * out any Dirty bit passed in newprot. Handle the second case by * going through the mksaveddirty exercise. Only do this if the old * value was Write=1 to avoid doing this on Shadow Stack PTEs. */ if (oldval & _PAGE_RW) pte_result = pte_mksaveddirty(pte_result); else pte_result = pte_clear_saveddirty(pte_result); return pte_result; } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { pmdval_t val = pmd_val(pmd), oldval = val; pmd_t pmd_result; val &= (_HPAGE_CHG_MASK & ~_PAGE_DIRTY); val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK); pmd_result = __pmd(val); /* * Avoid creating shadow stack PMD by accident. See comment in * pte_modify(). */ if (oldval & _PAGE_RW) pmd_result = pmd_mksaveddirty(pmd_result); else pmd_result = pmd_clear_saveddirty(pmd_result); return pmd_result; } static inline pud_t pud_modify(pud_t pud, pgprot_t newprot) { pudval_t val = pud_val(pud), oldval = val; pud_t pud_result; val &= _HPAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PHYSICAL_PUD_PAGE_MASK); pud_result = __pud(val); /* * Avoid creating shadow stack PUD by accident. See comment in * pte_modify(). */ if (oldval & _PAGE_RW) pud_result = pud_mksaveddirty(pud_result); else pud_result = pud_clear_saveddirty(pud_result); return pud_result; } /* * mprotect needs to preserve PAT and encryption bits when updating * vm_page_prot */ #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK; pgprotval_t addbits = pgprot_val(newprot) & ~_PAGE_CHG_MASK; return __pgprot(preservebits | addbits); } #define pte_pgprot(x) __pgprot(pte_flags(x)) #define pmd_pgprot(x) __pgprot(pmd_flags(x)) #define pud_pgprot(x) __pgprot(pud_flags(x)) #define p4d_pgprot(x) __pgprot(p4d_flags(x)) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, enum page_cache_mode pcm, enum page_cache_mode new_pcm) { /* * PAT type is always WB for untracked ranges, so no need to check. */ if (x86_platform.is_untracked_pat_range(paddr, paddr + size)) return 1; /* * Certain new memtypes are not allowed with certain * requested memtype: * - request is uncached, return cannot be write-back * - request is write-combine, return cannot be write-back * - request is write-through, return cannot be write-back * - request is write-through, return cannot be write-combine */ if ((pcm == _PAGE_CACHE_MODE_UC_MINUS && new_pcm == _PAGE_CACHE_MODE_WB) || (pcm == _PAGE_CACHE_MODE_WC && new_pcm == _PAGE_CACHE_MODE_WB) || (pcm == _PAGE_CACHE_MODE_WT && new_pcm == _PAGE_CACHE_MODE_WB) || (pcm == _PAGE_CACHE_MODE_WT && new_pcm == _PAGE_CACHE_MODE_WC)) { return 0; } return 1; } pmd_t *populate_extra_pmd(unsigned long vaddr); pte_t *populate_extra_pte(unsigned long vaddr); #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd); /* * Take a PGD location (pgdp) and a pgd value that needs to be set there. * Populates the user and returns the resulting PGD that must be set in * the kernel copy of the page tables. */ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { if (!static_cpu_has(X86_FEATURE_PTI)) return pgd; return __pti_set_user_pgtbl(pgdp, pgd); } #else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; } #endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ #endif /* __ASSEMBLER__ */ #ifdef CONFIG_X86_32 # include <asm/pgtable_32.h> #else # include <asm/pgtable_64.h> #endif #ifndef __ASSEMBLER__ #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/log2.h> #include <asm/fixmap.h> static inline int pte_none(pte_t pte) { return !(pte.pte & ~(_PAGE_KNL_ERRATUM_MASK)); } #define __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t a, pte_t b) { return a.pte == b.pte; } static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { if (__pte_needs_invert(pte_val(pte))) return __pte(pte_val(pte) - (nr << PFN_PTE_SHIFT)); return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } #define pte_advance_pfn pte_advance_pfn static inline int pte_present(pte_t a) { return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } #define pte_accessible pte_accessible static inline bool pte_accessible(struct mm_struct *mm, pte_t a) { if (pte_flags(a) & _PAGE_PRESENT) return true; if ((pte_flags(a) & _PAGE_PROTNONE) && atomic_read(&mm->tlb_flush_pending)) return true; return false; } static inline int pmd_present(pmd_t pmd) { /* * Checking for _PAGE_PSE is needed too because * split_huge_page will temporarily clear the present bit (but * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). */ return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } #ifdef CONFIG_NUMA_BALANCING /* * These work without NUMA balancing but the kernel does not care. See the * comment in include/linux/pgtable.h */ static inline int pte_protnone(pte_t pte) { return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT)) == _PAGE_PROTNONE; } static inline int pmd_protnone(pmd_t pmd) { return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT)) == _PAGE_PROTNONE; } #endif /* CONFIG_NUMA_BALANCING */ static inline int pmd_none(pmd_t pmd) { /* Only check low word on 32-bit platforms, since it might be out of sync with upper half. */ unsigned long val = native_pmd_val(pmd); return (val & ~_PAGE_KNL_ERRATUM_MASK) == 0; } static inline unsigned long pmd_page_vaddr(pmd_t pmd) { return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) static inline int pmd_bad(pmd_t pmd) { return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != (_KERNPG_TABLE & ~_PAGE_ACCESSED); } static inline unsigned long pages_to_mb(unsigned long npg) { return npg >> (20 - PAGE_SHIFT); } #if CONFIG_PGTABLE_LEVELS > 2 static inline int pud_none(pud_t pud) { return (native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0; } static inline int pud_present(pud_t pud) { return pud_flags(pud) & _PAGE_PRESENT; } static inline pmd_t *pud_pgtable(pud_t pud) { return (pmd_t *)__va(pud_val(pud) & pud_pfn_mask(pud)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ #define pud_page(pud) pfn_to_page(pud_pfn(pud)) #define pud_leaf pud_leaf static inline bool pud_leaf(pud_t pud) { return pud_val(pud) & _PAGE_PSE; } static inline int pud_bad(pud_t pud) { return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; } #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 static inline int p4d_none(p4d_t p4d) { return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0; } static inline int p4d_present(p4d_t p4d) { return p4d_flags(p4d) & _PAGE_PRESENT; } static inline pud_t *p4d_pgtable(p4d_t p4d) { return (pud_t *)__va(p4d_val(p4d) & p4d_pfn_mask(p4d)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ #define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) static inline int p4d_bad(p4d_t p4d) { unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (p4d_flags(p4d) & ~ignore_flags) != 0; } #endif /* CONFIG_PGTABLE_LEVELS > 3 */ static inline unsigned long p4d_index(unsigned long address) { return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1); } #if CONFIG_PGTABLE_LEVELS > 4 static inline int pgd_present(pgd_t pgd) { if (!pgtable_l5_enabled()) return 1; return pgd_flags(pgd) & _PAGE_PRESENT; } static inline unsigned long pgd_page_vaddr(pgd_t pgd) { return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ #define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) /* to find an entry in a page-table-directory. */ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { if (!pgtable_l5_enabled()) return (p4d_t *)pgd; return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address); } static inline int pgd_bad(pgd_t pgd) { unsigned long ignore_flags = _PAGE_USER; if (!pgtable_l5_enabled()) return 0; if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; } static inline int pgd_none(pgd_t pgd) { if (!pgtable_l5_enabled()) return 0; /* * There is no need to do a workaround for the KNL stray * A/D bit erratum here. PGDs only point to page tables * except on 32-bit non-PAE which is not supported on * KNL. */ return !native_pgd_val(pgd); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* __ASSEMBLER__ */ #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) #ifndef __ASSEMBLER__ extern int direct_gbpages; void init_mem_mapping(void); void early_alloc_pgt_buf(void); void __init poking_init(void); unsigned long init_memory_mapping(unsigned long start, unsigned long end, pgprot_t prot); #ifdef CONFIG_X86_64 extern pgd_t trampoline_pgd_entry; #endif /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) { pte_t res = *ptep; /* Pure native function needs no input for mm, addr */ native_pte_clear(NULL, 0, ptep); return res; } static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp) { pmd_t res = *pmdp; native_pmd_clear(pmdp); return res; } static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) { pud_t res = *pudp; native_pud_clear(pudp); return res; } static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { page_table_check_pmd_set(mm, pmdp, pmd); set_pmd(pmdp, pmd); } static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { page_table_check_pud_set(mm, pudp, pud); native_set_pud(pudp, pud); } /* * We only update the dirty/accessed state if we set * the dirty bit by hand in the kernel, since the hardware * will do the accessed bit for us, and we don't want to * race with other CPU's that might be updating the dirty * bit at the same time. */ struct vm_area_struct; #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG extern int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH extern int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); page_table_check_pte_clear(mm, pte); return pte; } #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) { pte_t pte; if (full) { /* * Full address destruction in progress; paravirt does not * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); page_table_check_pte_clear(mm, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } return pte; } #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { /* * Avoid accidentally creating shadow stack PTEs * (Write=0,Dirty=1). Use cmpxchg() to prevent races with * the hardware setting Dirty=1. */ pte_t old_pte, new_pte; old_pte = READ_ONCE(*ptep); do { new_pte = pte_wrprotect(old_pte); } while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte)); } #define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); extern int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty); #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp); extern int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp); #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { pmd_t pmd = native_pmdp_get_and_clear(pmdp); page_table_check_pmd_clear(mm, pmd); return pmd; } #define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { pud_t pud = native_pudp_get_and_clear(pudp); page_table_check_pud_clear(mm, pud); return pud; } #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { /* * Avoid accidentally creating shadow stack PTEs * (Write=0,Dirty=1). Use cmpxchg() to prevent races with * the hardware setting Dirty=1. */ pmd_t old_pmd, new_pmd; old_pmd = READ_ONCE(*pmdp); do { new_pmd = pmd_wrprotect(old_pmd); } while (!try_cmpxchg((long *)pmdp, (long *)&old_pmd, *(long *)&new_pmd)); } #ifndef pmdp_establish #define pmdp_establish pmdp_establish static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { pmd_t old = *pmdp; WRITE_ONCE(*pmdp, pmd); return old; } } #endif #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static inline pud_t pudp_establish(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t pud) { page_table_check_pud_set(vma->vm_mm, pudp, pud); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pudp, pud); } else { pud_t old = *pudp; WRITE_ONCE(*pudp, pud); return old; } } #endif #define __HAVE_ARCH_PMDP_INVALIDATE_AD extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, pud_t *pudp); /* * Page table pages are page-aligned. The lower half of the top * level is used for userspace and the top half for the kernel. * * Returns true for parts of the PGD that map userspace and * false for the parts that map the kernel. */ static inline bool pgdp_maps_userspace(void *__ptr) { unsigned long ptr = (unsigned long)__ptr; return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START); } #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and * the user one is in the last 4k. To switch between them, you * just need to flip the 12th bit in their addresses. */ #define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT /* * This generates better code than the inline assembly in * __set_bit(). */ static inline void *ptr_set_bit(void *ptr, int bit) { unsigned long __ptr = (unsigned long)ptr; __ptr |= BIT(bit); return (void *)__ptr; } static inline void *ptr_clear_bit(void *ptr, int bit) { unsigned long __ptr = (unsigned long)ptr; __ptr &= ~BIT(bit); return (void *)__ptr; } static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) { return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); } static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) { return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); } static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) { return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); } static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) { return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); } #endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * * dst - pointer to pgd range anywhere on a pgd page * src - "" * count - the number of pgds to copy. * * dst and src can be on the same page, but the range must not overlap, * and must not cross a page boundary. */ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!static_cpu_has(X86_FEATURE_PTI)) return; /* Clone the user space pgd as well */ memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src), count * sizeof(pgd_t)); #endif } #define PTE_SHIFT ilog2(PTRS_PER_PTE) static inline int page_level_shift(enum pg_level level) { return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT; } static inline unsigned long page_level_size(enum pg_level level) { return 1UL << page_level_shift(level); } static inline unsigned long page_level_mask(enum pg_level level) { return ~(page_level_size(level) - 1); } /* * The x86 doesn't have any external MMU info: the kernel page * tables contain all the necessary information. */ static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { } static inline void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { } static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { } static inline void update_mmu_cache_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { } static inline pte_t pte_swp_mkexclusive(pte_t pte) { return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE); } static inline bool pte_swp_exclusive(pte_t pte) { return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE; } static inline pte_t pte_swp_clear_exclusive(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); } static inline int pte_swp_soft_dirty(pte_t pte) { return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY); } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY); } #endif #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline pte_t pte_swp_mkuffd_wp(pte_t pte) { return pte_set_flags(pte, _PAGE_SWP_UFFD_WP); } static inline int pte_swp_uffd_wp(pte_t pte) { return pte_flags(pte) & _PAGE_SWP_UFFD_WP; } static inline pte_t pte_swp_clear_uffd_wp(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP); } static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_SWP_UFFD_WP); } static inline int pmd_swp_uffd_wp(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_SWP_UFFD_WP; } static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_SWP_UFFD_WP); } #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ static inline u16 pte_flags_pkey(unsigned long pte_flags) { #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS /* ifdef to avoid doing 59-bit shift on 32-bit values */ return (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0; #else return 0; #endif } static inline bool __pkru_allows_pkey(u16 pkey, bool write) { u32 pkru = read_pkru(); if (!__pkru_allows_read(pkru, pkey)) return false; if (write && !__pkru_allows_write(pkru, pkey)) return false; return true; } /* * 'pteval' can come from a PTE, PMD or PUD. We only check * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the * same value on all 3 types. */ static inline bool __pte_access_permitted(unsigned long pteval, bool write) { unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; /* * Write=0,Dirty=1 PTEs are shadow stack, which the kernel * shouldn't generally allow access to, but since they * are already Write=0, the below logic covers both cases. */ if (write) need_pte_bits |= _PAGE_RW; if ((pteval & need_pte_bits) != need_pte_bits) return 0; return __pkru_allows_pkey(pte_flags_pkey(pteval), write); } #define pte_access_permitted pte_access_permitted static inline bool pte_access_permitted(pte_t pte, bool write) { return __pte_access_permitted(pte_val(pte), write); } #define pmd_access_permitted pmd_access_permitted static inline bool pmd_access_permitted(pmd_t pmd, bool write) { return __pte_access_permitted(pmd_val(pmd), write); } #define pud_access_permitted pud_access_permitted static inline bool pud_access_permitted(pud_t pud, bool write) { return __pte_access_permitted(pud_val(pud), write); } #define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1 extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot); static inline bool arch_has_pfn_modify_check(void) { return boot_cpu_has_bug(X86_BUG_L1TF); } #define arch_check_zapped_pte arch_check_zapped_pte void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte); #define arch_check_zapped_pmd arch_check_zapped_pmd void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd); #define arch_check_zapped_pud arch_check_zapped_pud void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud); #ifdef CONFIG_XEN_PV #define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young static inline bool arch_has_hw_nonleaf_pmd_young(void) { return !cpu_feature_enabled(X86_FEATURE_XENPV); } #endif #ifdef CONFIG_PAGE_TABLE_CHECK static inline bool pte_user_accessible_page(pte_t pte) { return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); } static inline bool pmd_user_accessible_page(pmd_t pmd) { return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && (pmd_val(pmd) & _PAGE_USER); } static inline bool pud_user_accessible_page(pud_t pud) { return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && (pud_val(pud) & _PAGE_USER); } #endif #ifdef CONFIG_X86_SGX int arch_memory_failure(unsigned long pfn, int flags); #define arch_memory_failure arch_memory_failure bool arch_is_platform_page(u64 paddr); #define arch_is_platform_page arch_is_platform_page #endif /* * Use set_p*_safe(), and elide TLB flushing, when confident that *no* * TLB flush will be required as a result of the "set". For example, use * in scenarios where it is known ahead of time that the routine is * setting non-present entries, or re-setting an existing entry to the * same value. Otherwise, use the typical "set" helpers and flush the * TLB. */ #define set_pte_safe(ptep, pte) \ ({ \ WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \ set_pte(ptep, pte); \ }) #define set_pmd_safe(pmdp, pmd) \ ({ \ WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \ set_pmd(pmdp, pmd); \ }) #define set_pud_safe(pudp, pud) \ ({ \ WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \ set_pud(pudp, pud); \ }) #define set_p4d_safe(p4dp, p4d) \ ({ \ WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ set_p4d(p4dp, p4d); \ }) #define set_pgd_safe(pgdp, pgd) \ ({ \ WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ set_pgd(pgdp, pgd); \ }) #endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PGTABLE_H */
22 3 3 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 /* SPDX-License-Identifier: GPL-2.0 */ /* * * V 4 L 2 D R I V E R H E L P E R A P I * * Moved from videodev2.h * * Some commonly needed functions for drivers (v4l2-common.o module) */ #ifndef _V4L2_DEV_H #define _V4L2_DEV_H #include <linux/poll.h> #include <linux/fs.h> #include <linux/device.h> #include <linux/cdev.h> #include <linux/mutex.h> #include <linux/videodev2.h> #include <media/media-entity.h> #define VIDEO_MAJOR 81 /** * enum vfl_devnode_type - type of V4L2 device node * * @VFL_TYPE_VIDEO: for video input/output devices * @VFL_TYPE_VBI: for vertical blank data (i.e. closed captions, teletext) * @VFL_TYPE_RADIO: for radio tuners * @VFL_TYPE_SUBDEV: for V4L2 subdevices * @VFL_TYPE_SDR: for Software Defined Radio tuners * @VFL_TYPE_TOUCH: for touch sensors * @VFL_TYPE_MAX: number of VFL types, must always be last in the enum */ enum vfl_devnode_type { VFL_TYPE_VIDEO, VFL_TYPE_VBI, VFL_TYPE_RADIO, VFL_TYPE_SUBDEV, VFL_TYPE_SDR, VFL_TYPE_TOUCH, VFL_TYPE_MAX /* Shall be the last one */ }; /** * enum vfl_devnode_direction - Identifies if a &struct video_device * corresponds to a receiver, a transmitter or a mem-to-mem device. * * @VFL_DIR_RX: device is a receiver. * @VFL_DIR_TX: device is a transmitter. * @VFL_DIR_M2M: device is a memory to memory device. * * Note: Ignored if &enum vfl_devnode_type is %VFL_TYPE_SUBDEV. */ enum vfl_devnode_direction { VFL_DIR_RX, VFL_DIR_TX, VFL_DIR_M2M, }; struct v4l2_ioctl_callbacks; struct video_device; struct v4l2_device; struct v4l2_ctrl_handler; struct dentry; /** * enum v4l2_video_device_flags - Flags used by &struct video_device * * @V4L2_FL_REGISTERED: * indicates that a &struct video_device is registered. * Drivers can clear this flag if they want to block all future * device access. It is cleared by video_unregister_device. * @V4L2_FL_USES_V4L2_FH: * indicates that file->private_data points to &struct v4l2_fh. * This flag is set by the core when v4l2_fh_init() is called. * All drivers must use it. * @V4L2_FL_QUIRK_INVERTED_CROP: * some old M2M drivers use g/s_crop/cropcap incorrectly: crop and * compose are swapped. If this flag is set, then the selection * targets are swapped in the g/s_crop/cropcap functions in v4l2-ioctl.c. * This allows those drivers to correctly implement the selection API, * but the old crop API will still work as expected in order to preserve * backwards compatibility. * Never set this flag for new drivers. * @V4L2_FL_SUBDEV_RO_DEVNODE: * indicates that the video device node is registered in read-only mode. * The flag only applies to device nodes registered for sub-devices, it is * set by the core when the sub-devices device nodes are registered with * v4l2_device_register_ro_subdev_nodes() and used by the sub-device ioctl * handler to restrict access to some ioctl calls. */ enum v4l2_video_device_flags { V4L2_FL_REGISTERED = 0, V4L2_FL_USES_V4L2_FH = 1, V4L2_FL_QUIRK_INVERTED_CROP = 2, V4L2_FL_SUBDEV_RO_DEVNODE = 3, }; /* Priority helper functions */ /** * struct v4l2_prio_state - stores the priority states * * @prios: array with elements to store the array priorities * * * .. note:: * The size of @prios array matches the number of priority types defined * by enum &v4l2_priority. */ struct v4l2_prio_state { atomic_t prios[4]; }; /** * v4l2_prio_init - initializes a struct v4l2_prio_state * * @global: pointer to &struct v4l2_prio_state */ void v4l2_prio_init(struct v4l2_prio_state *global); /** * v4l2_prio_change - changes the v4l2 file handler priority * * @global: pointer to the &struct v4l2_prio_state of the device node. * @local: pointer to the desired priority, as defined by enum &v4l2_priority * @new: Priority type requested, as defined by enum &v4l2_priority. * * .. note:: * This function should be used only by the V4L2 core. */ int v4l2_prio_change(struct v4l2_prio_state *global, enum v4l2_priority *local, enum v4l2_priority new); /** * v4l2_prio_open - Implements the priority logic for a file handler open * * @global: pointer to the &struct v4l2_prio_state of the device node. * @local: pointer to the desired priority, as defined by enum &v4l2_priority * * .. note:: * This function should be used only by the V4L2 core. */ void v4l2_prio_open(struct v4l2_prio_state *global, enum v4l2_priority *local); /** * v4l2_prio_close - Implements the priority logic for a file handler close * * @global: pointer to the &struct v4l2_prio_state of the device node. * @local: priority to be released, as defined by enum &v4l2_priority * * .. note:: * This function should be used only by the V4L2 core. */ void v4l2_prio_close(struct v4l2_prio_state *global, enum v4l2_priority local); /** * v4l2_prio_max - Return the maximum priority, as stored at the @global array. * * @global: pointer to the &struct v4l2_prio_state of the device node. * * .. note:: * This function should be used only by the V4L2 core. */ enum v4l2_priority v4l2_prio_max(struct v4l2_prio_state *global); /** * v4l2_prio_check - Implements the priority logic for a file handler close * * @global: pointer to the &struct v4l2_prio_state of the device node. * @local: desired priority, as defined by enum &v4l2_priority local * * .. note:: * This function should be used only by the V4L2 core. */ int v4l2_prio_check(struct v4l2_prio_state *global, enum v4l2_priority local); /** * struct v4l2_file_operations - fs operations used by a V4L2 device * * @owner: pointer to struct module * @read: operations needed to implement the read() syscall * @write: operations needed to implement the write() syscall * @poll: operations needed to implement the poll() syscall * @unlocked_ioctl: operations needed to implement the ioctl() syscall * @compat_ioctl32: operations needed to implement the ioctl() syscall for * the special case where the Kernel uses 64 bits instructions, but * the userspace uses 32 bits. * @get_unmapped_area: called by the mmap() syscall, used when %!CONFIG_MMU * @mmap: operations needed to implement the mmap() syscall * @open: operations needed to implement the open() syscall * @release: operations needed to implement the release() syscall * * .. note:: * * Those operations are used to implemente the fs struct file_operations * at the V4L2 drivers. The V4L2 core overrides the fs ops with some * extra logic needed by the subsystem. */ struct v4l2_file_operations { struct module *owner; ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); __poll_t (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT long (*compat_ioctl32) (struct file *, unsigned int, unsigned long); #endif unsigned long (*get_unmapped_area) (struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct file *); int (*release) (struct file *); }; /* * Newer version of video_device, handled by videodev2.c * This version moves redundant code from video device code to * the common handler */ /** * struct video_device - Structure used to create and manage the V4L2 device * nodes. * * @entity: &struct media_entity * @intf_devnode: pointer to &struct media_intf_devnode * @pipe: &struct media_pipeline * @fops: pointer to &struct v4l2_file_operations for the video device * @device_caps: device capabilities as used in v4l2_capabilities * @dev: &struct device for the video device * @cdev: character device * @v4l2_dev: pointer to &struct v4l2_device parent * @dev_parent: pointer to &struct device parent * @ctrl_handler: Control handler associated with this device node. * May be NULL. * @queue: &struct vb2_queue associated with this device node. May be NULL. * @prio: pointer to &struct v4l2_prio_state with device's Priority state. * If NULL, then v4l2_dev->prio will be used. * @name: video device name * @vfl_type: V4L device type, as defined by &enum vfl_devnode_type * @vfl_dir: V4L receiver, transmitter or m2m * @minor: device node 'minor'. It is set to -1 if the registration failed * @num: number of the video device node * @flags: video device flags. Use bitops to set/clear/test flags. * Contains a set of &enum v4l2_video_device_flags. * @index: attribute to differentiate multiple indices on one physical device * @fh_lock: Lock for all v4l2_fhs * @fh_list: List of &struct v4l2_fh * @dev_debug: Internal device debug flags, not for use by drivers * @tvnorms: Supported tv norms * * @release: video device release() callback * @ioctl_ops: pointer to &struct v4l2_ioctl_ops with ioctl callbacks * * @valid_ioctls: bitmap with the valid ioctls for this device * @lock: pointer to &struct mutex serialization lock * * .. note:: * Only set @dev_parent if that can't be deduced from @v4l2_dev. */ struct video_device { #if defined(CONFIG_MEDIA_CONTROLLER) struct media_entity entity; struct media_intf_devnode *intf_devnode; struct media_pipeline pipe; #endif const struct v4l2_file_operations *fops; u32 device_caps; /* sysfs */ struct device dev; struct cdev *cdev; struct v4l2_device *v4l2_dev; struct device *dev_parent; struct v4l2_ctrl_handler *ctrl_handler; struct vb2_queue *queue; struct v4l2_prio_state *prio; /* device info */ char name[64]; enum vfl_devnode_type vfl_type; enum vfl_devnode_direction vfl_dir; int minor; u16 num; unsigned long flags; int index; /* V4L2 file handles */ spinlock_t fh_lock; struct list_head fh_list; int dev_debug; v4l2_std_id tvnorms; /* callbacks */ void (*release)(struct video_device *vdev); const struct v4l2_ioctl_ops *ioctl_ops; DECLARE_BITMAP(valid_ioctls, BASE_VIDIOC_PRIVATE); struct mutex *lock; }; /** * media_entity_to_video_device - Returns a &struct video_device from * the &struct media_entity embedded on it. * * @__entity: pointer to &struct media_entity, may be NULL */ #define media_entity_to_video_device(__entity) \ ({ \ typeof(__entity) __me_vdev_ent = __entity; \ \ __me_vdev_ent ? \ container_of(__me_vdev_ent, struct video_device, entity) : \ NULL; \ }) /** * to_video_device - Returns a &struct video_device from the * &struct device embedded on it. * * @cd: pointer to &struct device */ #define to_video_device(cd) container_of(cd, struct video_device, dev) /** * __video_register_device - register video4linux devices * * @vdev: struct video_device to register * @type: type of device to register, as defined by &enum vfl_devnode_type * @nr: which device node number is desired: * (0 == /dev/video0, 1 == /dev/video1, ..., -1 == first free) * @warn_if_nr_in_use: warn if the desired device node number * was already in use and another number was chosen instead. * @owner: module that owns the video device node * * The registration code assigns minor numbers and device node numbers * based on the requested type and registers the new device node with * the kernel. * * This function assumes that struct video_device was zeroed when it * was allocated and does not contain any stale date. * * An error is returned if no free minor or device node number could be * found, or if the registration of the device node failed. * * Returns 0 on success. * * .. note:: * * This function is meant to be used only inside the V4L2 core. * Drivers should use video_register_device() or * video_register_device_no_warn(). */ int __must_check __video_register_device(struct video_device *vdev, enum vfl_devnode_type type, int nr, int warn_if_nr_in_use, struct module *owner); /** * video_register_device - register video4linux devices * * @vdev: struct video_device to register * @type: type of device to register, as defined by &enum vfl_devnode_type * @nr: which device node number is desired: * (0 == /dev/video0, 1 == /dev/video1, ..., -1 == first free) * * Internally, it calls __video_register_device(). Please see its * documentation for more details. * * .. note:: * if video_register_device fails, the release() callback of * &struct video_device structure is *not* called, so the caller * is responsible for freeing any data. Usually that means that * you video_device_release() should be called on failure. */ static inline int __must_check video_register_device(struct video_device *vdev, enum vfl_devnode_type type, int nr) { return __video_register_device(vdev, type, nr, 1, vdev->fops->owner); } /** * video_register_device_no_warn - register video4linux devices * * @vdev: struct video_device to register * @type: type of device to register, as defined by &enum vfl_devnode_type * @nr: which device node number is desired: * (0 == /dev/video0, 1 == /dev/video1, ..., -1 == first free) * * This function is identical to video_register_device() except that no * warning is issued if the desired device node number was already in use. * * Internally, it calls __video_register_device(). Please see its * documentation for more details. * * .. note:: * if video_register_device fails, the release() callback of * &struct video_device structure is *not* called, so the caller * is responsible for freeing any data. Usually that means that * you video_device_release() should be called on failure. */ static inline int __must_check video_register_device_no_warn(struct video_device *vdev, enum vfl_devnode_type type, int nr) { return __video_register_device(vdev, type, nr, 0, vdev->fops->owner); } /** * video_unregister_device - Unregister video devices. * * @vdev: &struct video_device to register * * Does nothing if vdev == NULL or if video_is_registered() returns false. */ void video_unregister_device(struct video_device *vdev); /** * video_device_alloc - helper function to alloc &struct video_device * * Returns NULL if %-ENOMEM or a &struct video_device on success. */ struct video_device * __must_check video_device_alloc(void); /** * video_device_release - helper function to release &struct video_device * * @vdev: pointer to &struct video_device * * Can also be used for video_device->release\(\). */ void video_device_release(struct video_device *vdev); /** * video_device_release_empty - helper function to implement the * video_device->release\(\) callback. * * @vdev: pointer to &struct video_device * * This release function does nothing. * * It should be used when the video_device is a static global struct. * * .. note:: * Having a static video_device is a dubious construction at best. */ void video_device_release_empty(struct video_device *vdev); /** * v4l2_disable_ioctl- mark that a given command isn't implemented. * shouldn't use core locking * * @vdev: pointer to &struct video_device * @cmd: ioctl command * * This function allows drivers to provide just one v4l2_ioctl_ops struct, but * disable ioctls based on the specific card that is actually found. * * .. note:: * * This must be called before video_register_device. * See also the comments for determine_valid_ioctls(). */ static inline void v4l2_disable_ioctl(struct video_device *vdev, unsigned int cmd) { if (_IOC_NR(cmd) < BASE_VIDIOC_PRIVATE) set_bit(_IOC_NR(cmd), vdev->valid_ioctls); } /** * video_get_drvdata - gets private data from &struct video_device. * * @vdev: pointer to &struct video_device * * returns a pointer to the private data */ static inline void *video_get_drvdata(struct video_device *vdev) { return dev_get_drvdata(&vdev->dev); } /** * video_set_drvdata - sets private data from &struct video_device. * * @vdev: pointer to &struct video_device * @data: private data pointer */ static inline void video_set_drvdata(struct video_device *vdev, void *data) { dev_set_drvdata(&vdev->dev, data); } /** * video_devdata - gets &struct video_device from struct file. * * @file: pointer to struct file */ struct video_device *video_devdata(struct file *file); /** * video_drvdata - gets private data from &struct video_device using the * struct file. * * @file: pointer to struct file * * This is function combines both video_get_drvdata() and video_devdata() * as this is used very often. */ static inline void *video_drvdata(struct file *file) { return video_get_drvdata(video_devdata(file)); } /** * video_device_node_name - returns the video device name * * @vdev: pointer to &struct video_device * * Returns the device name string */ static inline const char *video_device_node_name(struct video_device *vdev) { return dev_name(&vdev->dev); } /** * video_is_registered - returns true if the &struct video_device is registered. * * * @vdev: pointer to &struct video_device */ static inline int video_is_registered(struct video_device *vdev) { return test_bit(V4L2_FL_REGISTERED, &vdev->flags); } /** * v4l2_debugfs_root - returns the dentry of the top-level "v4l2" debugfs dir * * If this directory does not yet exist, then it will be created. */ #ifdef CONFIG_DEBUG_FS struct dentry *v4l2_debugfs_root(void); #else static inline struct dentry *v4l2_debugfs_root(void) { return NULL; } #endif #if defined(CONFIG_MEDIA_CONTROLLER) /** * video_device_pipeline_start - Mark a pipeline as streaming * @vdev: Starting video device * @pipe: Media pipeline to be assigned to all entities in the pipeline. * * Mark all entities connected to a given video device through enabled links, * either directly or indirectly, as streaming. The given pipeline object is * assigned to every pad in the pipeline and stored in the media_pad pipe * field. * * Calls to this function can be nested, in which case the same number of * video_device_pipeline_stop() calls will be required to stop streaming. The * pipeline pointer must be identical for all nested calls to * video_device_pipeline_start(). * * The video device must contain a single pad. * * This is a convenience wrapper around media_pipeline_start(). */ __must_check int video_device_pipeline_start(struct video_device *vdev, struct media_pipeline *pipe); /** * __video_device_pipeline_start - Mark a pipeline as streaming * @vdev: Starting video device * @pipe: Media pipeline to be assigned to all entities in the pipeline. * * ..note:: This is the non-locking version of video_device_pipeline_start() * * The video device must contain a single pad. * * This is a convenience wrapper around __media_pipeline_start(). */ __must_check int __video_device_pipeline_start(struct video_device *vdev, struct media_pipeline *pipe); /** * video_device_pipeline_stop - Mark a pipeline as not streaming * @vdev: Starting video device * * Mark all entities connected to a given video device through enabled links, * either directly or indirectly, as not streaming. The media_pad pipe field * is reset to %NULL. * * If multiple calls to media_pipeline_start() have been made, the same * number of calls to this function are required to mark the pipeline as not * streaming. * * The video device must contain a single pad. * * This is a convenience wrapper around media_pipeline_stop(). */ void video_device_pipeline_stop(struct video_device *vdev); /** * __video_device_pipeline_stop - Mark a pipeline as not streaming * @vdev: Starting video device * * .. note:: This is the non-locking version of media_pipeline_stop() * * The video device must contain a single pad. * * This is a convenience wrapper around __media_pipeline_stop(). */ void __video_device_pipeline_stop(struct video_device *vdev); /** * video_device_pipeline_alloc_start - Mark a pipeline as streaming * @vdev: Starting video device * * video_device_pipeline_alloc_start() is similar to video_device_pipeline_start() * but instead of working on a given pipeline the function will use an * existing pipeline if the video device is already part of a pipeline, or * allocate a new pipeline. * * Calls to video_device_pipeline_alloc_start() must be matched with * video_device_pipeline_stop(). */ __must_check int video_device_pipeline_alloc_start(struct video_device *vdev); /** * video_device_pipeline - Get the media pipeline a video device is part of * @vdev: The video device * * This function returns the media pipeline that a video device has been * associated with when constructing the pipeline with * video_device_pipeline_start(). The pointer remains valid until * video_device_pipeline_stop() is called. * * Return: The media_pipeline the video device is part of, or NULL if the video * device is not part of any pipeline. * * The video device must contain a single pad. * * This is a convenience wrapper around media_entity_pipeline(). */ struct media_pipeline *video_device_pipeline(struct video_device *vdev); #endif /* CONFIG_MEDIA_CONTROLLER */ #endif /* _V4L2_DEV_H */
30 30 1 30 10 22 30 2 2 1 2 2 1 1 1 1 1 1 1 1 2 1 1 1 1 4 4 2 2 4 2 2 2 1 33 5 32 32 20 8 3 3 3 32 31 2 2 31 29 3 32 26 4 4 4 27 2 27 2 2 1 1 49 39 39 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 // SPDX-License-Identifier: GPL-2.0-only /* * Off-channel operation helpers * * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2019, 2022-2025 Intel Corporation */ #include <linux/export.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" /* * Tell our hardware to disable PS. * Optionally inform AP that we will go to sleep so that it will buffer * the frames while we are doing off-channel work. This is optional * because we *may* be doing work on-operating channel, and want our * hardware unconditionally awake, but still let the AP send us normal frames. */ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; bool offchannel_ps_enabled = false; /* FIXME: what to do when local->pspolling is true? */ timer_delete_sync(&local->dynamic_ps_timer); timer_delete_sync(&ifmgd->bcn_mon_timer); timer_delete_sync(&ifmgd->conn_mon_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); if (local->hw.conf.flags & IEEE80211_CONF_PS) { offchannel_ps_enabled = true; local->hw.conf.flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } if (!offchannel_ps_enabled || !ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) /* * If power save was enabled, no need to send a nullfunc * frame because AP knows that we are sleeping. But if the * hardware is creating the nullfunc frame for power save * status (ie. IEEE80211_HW_PS_NULLFUNC_STACK is not * enabled) and power save was enabled, the firmware just * sent a null frame with power save disabled. So we need * to send a new nullfunc frame to inform the AP that we * are again sleeping. */ ieee80211_send_nullfunc(local, sdata, true); } /* inform AP that we are awake again */ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; if (!local->ps_sdata) ieee80211_send_nullfunc(local, sdata, false); else if (local->hw.conf.dynamic_ps_timeout > 0) { /* * the dynamic_ps_timer had been running before leaving the * operating channel, restart the timer now and send a nullfunc * frame to inform the AP that we are awake so that AP sends * the buffered packets (if any). */ ieee80211_send_nullfunc(local, sdata, false); mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); } ieee80211_sta_reset_beacon_monitor(sdata); ieee80211_sta_reset_conn_monitor(sdata); } void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(!local->emulate_chanctx)) return; /* * notify the AP about us leaving the channel and stop all * STA interfaces. */ /* * Stop queues and transmit all frames queued by the driver * before sending nullfunc to enable powersave at the AP. */ ieee80211_stop_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, false); ieee80211_flush_queues(local, NULL, false); list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || sdata->vif.type == NL80211_IFTYPE_NAN) continue; if (sdata->vif.type != NL80211_IFTYPE_MONITOR) set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); /* Check to see if we should disable beaconing. */ if (sdata->vif.bss_conf.enable_beacon) { set_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); sdata->vif.bss_conf.enable_beacon = false; ieee80211_link_info_change_notify( sdata, &sdata->deflink, BSS_CHANGED_BEACON_ENABLED); } if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.associated) ieee80211_offchannel_ps_enable(sdata); } } void ieee80211_offchannel_return(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(!local->emulate_chanctx)) return; list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) continue; if (sdata->vif.type != NL80211_IFTYPE_MONITOR) clear_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); if (!ieee80211_sdata_running(sdata)) continue; /* Tell AP we're back */ if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.associated) ieee80211_offchannel_ps_disable(sdata); if (test_and_clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state)) { sdata->vif.bss_conf.enable_beacon = true; ieee80211_link_info_change_notify( sdata, &sdata->deflink, BSS_CHANGED_BEACON_ENABLED); } } ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, false); } static void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc) { /* was never transmitted */ if (roc->frame) { cfg80211_mgmt_tx_status(&roc->sdata->wdev, roc->mgmt_tx_cookie, roc->frame->data, roc->frame->len, false, GFP_KERNEL); ieee80211_free_txskb(&roc->sdata->local->hw, roc->frame); } if (!roc->mgmt_tx_cookie) cfg80211_remain_on_channel_expired(&roc->sdata->wdev, roc->cookie, roc->chan, GFP_KERNEL); else cfg80211_tx_mgmt_expired(&roc->sdata->wdev, roc->mgmt_tx_cookie, roc->chan, GFP_KERNEL); list_del(&roc->list); kfree(roc); } static unsigned long ieee80211_end_finished_rocs(struct ieee80211_local *local, unsigned long now) { struct ieee80211_roc_work *roc, *tmp; long remaining_dur_min = LONG_MAX; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { long remaining; if (!roc->started) break; remaining = roc->start_time + msecs_to_jiffies(roc->duration) - now; /* In case of HW ROC, it is possible that the HW finished the * ROC session before the actual requested time. In such a case * end the ROC session (disregarding the remaining time). */ if (roc->abort || roc->hw_begun || remaining <= 0) ieee80211_roc_notify_destroy(roc); else remaining_dur_min = min(remaining_dur_min, remaining); } return remaining_dur_min; } static bool ieee80211_recalc_sw_work(struct ieee80211_local *local, unsigned long now) { long dur = ieee80211_end_finished_rocs(local, now); if (dur == LONG_MAX) return false; wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, dur); return true; } static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc, unsigned long start_time) { if (WARN_ON(roc->notified)) return; roc->start_time = start_time; roc->started = true; if (roc->mgmt_tx_cookie) { if (!WARN_ON(!roc->frame)) { ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7, roc->chan->band); roc->frame = NULL; } } else { cfg80211_ready_on_channel(&roc->sdata->wdev, roc->cookie, roc->chan, roc->req_duration, GFP_KERNEL); } roc->notified = true; } static void ieee80211_hw_roc_start(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, hw_roc_start); struct ieee80211_roc_work *roc; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(roc, &local->roc_list, list) { if (!roc->started) break; roc->hw_begun = true; ieee80211_handle_roc_started(roc, local->hw_roc_start_time); } } void ieee80211_ready_on_channel(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); local->hw_roc_start_time = jiffies; trace_api_ready_on_channel(local); wiphy_work_queue(hw->wiphy, &local->hw_roc_start); } EXPORT_SYMBOL_GPL(ieee80211_ready_on_channel); static void _ieee80211_start_next_roc(struct ieee80211_local *local) { struct ieee80211_roc_work *roc, *tmp; enum ieee80211_roc_type type; u32 min_dur, max_dur; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(list_empty(&local->roc_list))) return; roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work, list); if (WARN_ON(roc->started)) return; min_dur = roc->duration; max_dur = roc->duration; type = roc->type; list_for_each_entry(tmp, &local->roc_list, list) { if (tmp == roc) continue; if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; max_dur = max(tmp->duration, max_dur); min_dur = min(tmp->duration, min_dur); type = max(tmp->type, type); } if (local->ops->remain_on_channel) { int ret = drv_remain_on_channel(local, roc->sdata, roc->chan, max_dur, type); if (ret) { wiphy_warn(local->hw.wiphy, "failed to start next HW ROC (%d)\n", ret); /* * queue the work struct again to avoid recursion * when multiple failures occur */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; tmp->started = true; tmp->abort = true; } wiphy_work_queue(local->hw.wiphy, &local->hw_roc_done); return; } /* we'll notify about the start once the HW calls back */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; tmp->started = true; } } else { /* If actually operating on the desired channel (with at least * 20 MHz channel width) don't stop all the operations but still * treat it as though the ROC operation started properly, so * other ROC operations won't interfere with this one. * * Note: scan can't run, tmp_channel is what we use, so this * must be the currently active channel. */ roc->on_channel = roc->chan == local->hw.conf.chandef.chan && local->hw.conf.chandef.width != NL80211_CHAN_WIDTH_5 && local->hw.conf.chandef.width != NL80211_CHAN_WIDTH_10; /* start this ROC */ ieee80211_recalc_idle(local); if (!roc->on_channel) { ieee80211_offchannel_stop_vifs(local); local->tmp_channel = roc->chan; ieee80211_hw_conf_chan(local); } wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, msecs_to_jiffies(min_dur)); /* tell userspace or send frame(s) */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; tmp->on_channel = roc->on_channel; ieee80211_handle_roc_started(tmp, jiffies); } } } void ieee80211_start_next_roc(struct ieee80211_local *local) { struct ieee80211_roc_work *roc; lockdep_assert_wiphy(local->hw.wiphy); if (list_empty(&local->roc_list)) { ieee80211_run_deferred_scan(local); return; } /* defer roc if driver is not started (i.e. during reconfig) */ if (local->in_reconfig) return; roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work, list); if (WARN_ON_ONCE(roc->started)) return; if (local->ops->remain_on_channel) { _ieee80211_start_next_roc(local); } else { /* delay it a bit */ wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, round_jiffies_relative(HZ / 2)); } } void ieee80211_reconfig_roc(struct ieee80211_local *local) { struct ieee80211_roc_work *roc, *tmp; /* * In the software implementation can just continue with the * interruption due to reconfig, roc_work is still queued if * needed. */ if (!local->ops->remain_on_channel) return; /* flush work so nothing from the driver is still pending */ wiphy_work_flush(local->hw.wiphy, &local->hw_roc_start); wiphy_work_flush(local->hw.wiphy, &local->hw_roc_done); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!roc->started) break; if (!roc->hw_begun) { /* it didn't start in HW yet, so we can restart it */ roc->started = false; continue; } /* otherwise destroy it and tell userspace */ ieee80211_roc_notify_destroy(roc); } ieee80211_start_next_roc(local); } static void __ieee80211_roc_work(struct ieee80211_local *local) { struct ieee80211_roc_work *roc; bool on_channel; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(local->ops->remain_on_channel)) return; roc = list_first_entry_or_null(&local->roc_list, struct ieee80211_roc_work, list); if (!roc) return; if (!roc->started) { WARN_ON(!local->emulate_chanctx); _ieee80211_start_next_roc(local); } else { on_channel = roc->on_channel; if (ieee80211_recalc_sw_work(local, jiffies)) return; /* careful - roc pointer became invalid during recalc */ if (!on_channel) { ieee80211_flush_queues(local, NULL, false); local->tmp_channel = NULL; ieee80211_hw_conf_chan(local); ieee80211_offchannel_return(local); } ieee80211_recalc_idle(local); ieee80211_start_next_roc(local); } } static void ieee80211_roc_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, roc_work.work); lockdep_assert_wiphy(local->hw.wiphy); __ieee80211_roc_work(local); } static void ieee80211_hw_roc_done(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, hw_roc_done); lockdep_assert_wiphy(local->hw.wiphy); ieee80211_end_finished_rocs(local, jiffies); /* if there's another roc, start it now */ ieee80211_start_next_roc(local); } void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_remain_on_channel_expired(local); wiphy_work_queue(hw->wiphy, &local->hw_roc_done); } EXPORT_SYMBOL_GPL(ieee80211_remain_on_channel_expired); static bool ieee80211_coalesce_hw_started_roc(struct ieee80211_local *local, struct ieee80211_roc_work *new_roc, struct ieee80211_roc_work *cur_roc) { unsigned long now = jiffies; unsigned long remaining; if (WARN_ON(!cur_roc->started)) return false; /* if it was scheduled in the hardware, but not started yet, * we can only combine if the older one had a longer duration */ if (!cur_roc->hw_begun && new_roc->duration > cur_roc->duration) return false; remaining = cur_roc->start_time + msecs_to_jiffies(cur_roc->duration) - now; /* if it doesn't fit entirely, schedule a new one */ if (new_roc->duration > jiffies_to_msecs(remaining)) return false; /* add just after the current one so we combine their finish later */ list_add(&new_roc->list, &cur_roc->list); /* if the existing one has already begun then let this one also * begin, otherwise they'll both be marked properly by the work * struct that runs once the driver notifies us of the beginning */ if (cur_roc->hw_begun) { new_roc->hw_begun = true; ieee80211_handle_roc_started(new_roc, now); } return true; } static int ieee80211_start_roc_work(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, unsigned int duration, u64 *cookie, struct sk_buff *txskb, enum ieee80211_roc_type type) { struct ieee80211_roc_work *roc, *tmp; bool queued = false, combine_started = true; struct cfg80211_scan_request *req; int ret; lockdep_assert_wiphy(local->hw.wiphy); if (channel->freq_offset) /* this may work, but is untested */ return -EOPNOTSUPP; if (!local->emulate_chanctx && !local->ops->remain_on_channel) return -EOPNOTSUPP; roc = kzalloc(sizeof(*roc), GFP_KERNEL); if (!roc) return -ENOMEM; /* * If the duration is zero, then the driver * wouldn't actually do anything. Set it to * 10 for now. * * TODO: cancel the off-channel operation * when we get the SKB's TX status and * the wait time was zero before. */ if (!duration) duration = 10; roc->chan = channel; roc->duration = duration; roc->req_duration = duration; roc->frame = txskb; roc->type = type; roc->sdata = sdata; /* * cookie is either the roc cookie (for normal roc) * or the SKB (for mgmt TX) */ if (!txskb) { roc->cookie = ieee80211_mgmt_tx_cookie(local); *cookie = roc->cookie; } else { roc->mgmt_tx_cookie = *cookie; } req = wiphy_dereference(local->hw.wiphy, local->scan_req); /* if there's no need to queue, handle it immediately */ if (list_empty(&local->roc_list) && !local->scanning && !ieee80211_is_radar_required(local, req)) { /* if not HW assist, just queue & schedule work */ if (!local->ops->remain_on_channel) { list_add_tail(&roc->list, &local->roc_list); wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, 0); } else { /* otherwise actually kick it off here * (for error handling) */ ret = drv_remain_on_channel(local, sdata, channel, duration, type); if (ret) { kfree(roc); return ret; } roc->started = true; list_add_tail(&roc->list, &local->roc_list); } return 0; } /* otherwise handle queueing */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->chan != channel || tmp->sdata != sdata) continue; /* * Extend this ROC if possible: If it hasn't started, add * just after the new one to combine. */ if (!tmp->started) { list_add(&roc->list, &tmp->list); queued = true; break; } if (!combine_started) continue; if (!local->ops->remain_on_channel) { /* If there's no hardware remain-on-channel, and * doing so won't push us over the maximum r-o-c * we allow, then we can just add the new one to * the list and mark it as having started now. * If it would push over the limit, don't try to * combine with other started ones (that haven't * been running as long) but potentially sort it * with others that had the same fate. */ unsigned long now = jiffies; u32 elapsed = jiffies_to_msecs(now - tmp->start_time); struct wiphy *wiphy = local->hw.wiphy; u32 max_roc = wiphy->max_remain_on_channel_duration; if (elapsed + roc->duration > max_roc) { combine_started = false; continue; } list_add(&roc->list, &tmp->list); queued = true; roc->on_channel = tmp->on_channel; ieee80211_handle_roc_started(roc, now); ieee80211_recalc_sw_work(local, now); break; } queued = ieee80211_coalesce_hw_started_roc(local, roc, tmp); if (queued) break; /* if it wasn't queued, perhaps it can be combined with * another that also couldn't get combined previously, * but no need to check for already started ones, since * that can't work. */ combine_started = false; } if (!queued) list_add_tail(&roc->list, &local->roc_list); return 0; } int ieee80211_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, struct ieee80211_channel *chan, unsigned int duration, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); return ieee80211_start_roc_work(local, sdata, chan, duration, cookie, NULL, IEEE80211_ROC_TYPE_NORMAL); } static int ieee80211_cancel_roc(struct ieee80211_local *local, u64 cookie, bool mgmt_tx) { struct ieee80211_roc_work *roc, *tmp, *found = NULL; int ret; lockdep_assert_wiphy(local->hw.wiphy); if (!cookie) return -ENOENT; wiphy_work_flush(local->hw.wiphy, &local->hw_roc_start); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!mgmt_tx && roc->cookie != cookie) continue; else if (mgmt_tx && roc->mgmt_tx_cookie != cookie) continue; found = roc; break; } if (!found) { return -ENOENT; } if (!found->started) { ieee80211_roc_notify_destroy(found); goto out_unlock; } if (local->ops->remain_on_channel) { ret = drv_cancel_remain_on_channel(local, roc->sdata); if (WARN_ON_ONCE(ret)) { return ret; } /* * We could be racing against the notification from the driver: * + driver is handling the notification on CPU0 * + user space is cancelling the remain on channel and * schedules the hw_roc_done worker. * * Now hw_roc_done might start to run after the next roc will * start and mac80211 will think that this second roc has * ended prematurely. * Cancel the work to make sure that all the pending workers * have completed execution. * Note that this assumes that by the time the driver returns * from drv_cancel_remain_on_channel, it has completed all * the processing of related notifications. */ wiphy_work_cancel(local->hw.wiphy, &local->hw_roc_done); /* TODO: * if multiple items were combined here then we really shouldn't * cancel them all - we should wait for as much time as needed * for the longest remaining one, and only then cancel ... */ list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!roc->started) break; if (roc == found) found = NULL; ieee80211_roc_notify_destroy(roc); } /* that really must not happen - it was started */ WARN_ON(found); ieee80211_start_next_roc(local); } else { /* go through work struct to return to the operating channel */ found->abort = true; wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, 0); } out_unlock: return 0; } int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; return ieee80211_cancel_roc(local, cookie, false); } int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct sta_info *sta = NULL; const struct ieee80211_mgmt *mgmt = (void *)params->buf; bool need_offchan = false; bool mlo_sta = false; int link_id = -1; u32 flags; int ret; u8 *data; lockdep_assert_wiphy(local->hw.wiphy); if (params->dont_wait_for_ack) flags = IEEE80211_TX_CTL_NO_ACK; else flags = IEEE80211_TX_INTFL_NL80211_FRAME_TX | IEEE80211_TX_CTL_REQ_TX_STATUS; if (params->no_cck) flags |= IEEE80211_TX_CTL_NO_CCK_RATE; switch (sdata->vif.type) { case NL80211_IFTYPE_ADHOC: if (!sdata->vif.cfg.ibss_joined) need_offchan = true; #ifdef CONFIG_MAC80211_MESH fallthrough; case NL80211_IFTYPE_MESH_POINT: if (ieee80211_vif_is_mesh(&sdata->vif) && !sdata->u.mesh.mesh_id_len) need_offchan = true; #endif fallthrough; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_GO: if (sdata->vif.type != NL80211_IFTYPE_ADHOC && !ieee80211_vif_is_mesh(&sdata->vif) && !sdata->bss->active) need_offchan = true; rcu_read_lock(); sta = sta_info_get_bss(sdata, mgmt->da); mlo_sta = sta && sta->sta.mlo; if (!ieee80211_is_action(mgmt->frame_control) || mgmt->u.action.category == WLAN_CATEGORY_PUBLIC || mgmt->u.action.category == WLAN_CATEGORY_SELF_PROTECTED || mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT) { rcu_read_unlock(); break; } if (!sta) { rcu_read_unlock(); return -ENOLINK; } if (params->link_id >= 0 && !(sta->sta.valid_links & BIT(params->link_id))) { rcu_read_unlock(); return -ENOLINK; } link_id = params->link_id; rcu_read_unlock(); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: if (!sdata->u.mgd.associated || (params->offchan && params->wait && local->ops->remain_on_channel && memcmp(sdata->vif.cfg.ap_addr, mgmt->bssid, ETH_ALEN))) { need_offchan = true; } else if (sdata->u.mgd.associated && ether_addr_equal(sdata->vif.cfg.ap_addr, mgmt->da)) { sta = sta_info_get_bss(sdata, mgmt->da); mlo_sta = sta && sta->sta.mlo; } break; case NL80211_IFTYPE_P2P_DEVICE: need_offchan = true; break; case NL80211_IFTYPE_NAN: break; default: return -EOPNOTSUPP; } /* configurations requiring offchan cannot work if no channel has been * specified */ if (need_offchan && !params->chan) return -EINVAL; /* Check if the operating channel is the requested channel */ if (!params->chan && mlo_sta) { need_offchan = false; } else if (sdata->vif.type == NL80211_IFTYPE_NAN) { /* Frames can be sent during NAN schedule */ } else if (!need_offchan) { struct ieee80211_chanctx_conf *chanctx_conf = NULL; int i; rcu_read_lock(); /* Check all the links first */ for (i = 0; i < ARRAY_SIZE(sdata->vif.link_conf); i++) { struct ieee80211_bss_conf *conf; conf = rcu_dereference(sdata->vif.link_conf[i]); if (!conf) continue; chanctx_conf = rcu_dereference(conf->chanctx_conf); if (!chanctx_conf) continue; if (mlo_sta && params->chan == chanctx_conf->def.chan && ether_addr_equal(sdata->vif.addr, mgmt->sa)) { link_id = i; break; } if (ether_addr_equal(conf->addr, mgmt->sa)) { /* If userspace requested Tx on a specific link * use the same link id if the link bss is matching * the requested chan. */ if (sdata->vif.valid_links && params->link_id >= 0 && params->link_id == i && params->chan == chanctx_conf->def.chan) link_id = i; break; } chanctx_conf = NULL; } if (chanctx_conf) { need_offchan = params->chan && (params->chan != chanctx_conf->def.chan); } else { need_offchan = true; } rcu_read_unlock(); } if (need_offchan && !params->offchan) { ret = -EBUSY; goto out_unlock; } skb = dev_alloc_skb(local->hw.extra_tx_headroom + params->len); if (!skb) { ret = -ENOMEM; goto out_unlock; } skb_reserve(skb, local->hw.extra_tx_headroom); data = skb_put_data(skb, params->buf, params->len); /* Update CSA counters */ if (sdata->vif.bss_conf.csa_active && (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_MESH_POINT || sdata->vif.type == NL80211_IFTYPE_ADHOC) && params->n_csa_offsets) { int i; struct beacon_data *beacon = NULL; rcu_read_lock(); if (sdata->vif.type == NL80211_IFTYPE_AP) beacon = rcu_dereference(sdata->deflink.u.ap.beacon); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) beacon = rcu_dereference(sdata->u.ibss.presp); else if (ieee80211_vif_is_mesh(&sdata->vif)) beacon = rcu_dereference(sdata->u.mesh.beacon); if (beacon) for (i = 0; i < params->n_csa_offsets; i++) data[params->csa_offsets[i]] = beacon->cntdwn_current_counter; rcu_read_unlock(); } IEEE80211_SKB_CB(skb)->flags = flags; IEEE80211_SKB_CB(skb)->control.flags |= IEEE80211_TX_CTRL_DONT_USE_RATE_MASK; skb->dev = sdata->dev; if (!params->dont_wait_for_ack) { /* make a copy to preserve the frame contents * in case of encryption. */ ret = ieee80211_attach_ack_skb(local, skb, cookie, GFP_KERNEL); if (ret) { kfree_skb(skb); goto out_unlock; } } else { /* Assign a dummy non-zero cookie, it's not sent to * userspace in this case but we rely on its value * internally in the need_offchan case to distinguish * mgmt-tx from remain-on-channel. */ *cookie = 0xffffffff; } if (!need_offchan) { ieee80211_tx_skb_tid(sdata, skb, 7, link_id); ret = 0; goto out_unlock; } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN | IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) IEEE80211_SKB_CB(skb)->hw_queue = local->hw.offchannel_tx_hw_queue; /* This will handle all kinds of coalescing and immediate TX */ ret = ieee80211_start_roc_work(local, sdata, params->chan, params->wait, cookie, skb, IEEE80211_ROC_TYPE_MGMT_TX); if (ret) ieee80211_free_txskb(&local->hw, skb); out_unlock: return ret; } int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie) { struct ieee80211_local *local = wiphy_priv(wiphy); return ieee80211_cancel_roc(local, cookie, true); } void ieee80211_roc_setup(struct ieee80211_local *local) { wiphy_work_init(&local->hw_roc_start, ieee80211_hw_roc_start); wiphy_work_init(&local->hw_roc_done, ieee80211_hw_roc_done); wiphy_delayed_work_init(&local->roc_work, ieee80211_roc_work); INIT_LIST_HEAD(&local->roc_list); } void ieee80211_roc_purge(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct ieee80211_roc_work *roc, *tmp; bool work_to_do = false; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (sdata && roc->sdata != sdata) continue; if (roc->started) { if (local->ops->remain_on_channel) { /* can race, so ignore return value */ drv_cancel_remain_on_channel(local, roc->sdata); ieee80211_roc_notify_destroy(roc); } else { roc->abort = true; work_to_do = true; } } else { ieee80211_roc_notify_destroy(roc); } } if (work_to_do) __ieee80211_roc_work(local); }
3 9 14 3 7 1 6 7 37 3 3 7 7 6 6 5 5 7 5 5 9 9 4 3 3 2 2 5 3 3 3 11 11 8 11 1 1 9 9 1 8 1 6 1 1 2 22 1 11 7 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 // SPDX-License-Identifier: GPL-2.0-only /* * Sync File validation framework * * Copyright (C) 2012 Google, Inc. */ #include <linux/file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/sync_file.h> #include "sync_debug.h" #define CREATE_TRACE_POINTS #include "sync_trace.h" /* * SW SYNC validation framework * * A sync object driver that uses a 32bit counter to coordinate * synchronization. Useful when there is no hardware primitive backing * the synchronization. * * To start the framework just open: * * <debugfs>/sync/sw_sync * * That will create a sync timeline, all fences created under this timeline * file descriptor will belong to the this timeline. * * The 'sw_sync' file can be opened many times as to create different * timelines. * * Fences can be created with SW_SYNC_IOC_CREATE_FENCE ioctl with struct * sw_sync_create_fence_data as parameter. * * To increment the timeline counter, SW_SYNC_IOC_INC ioctl should be used * with the increment as u32. This will update the last signaled value * from the timeline and signal any fence that has a seqno smaller or equal * to it. * * struct sw_sync_create_fence_data * @value: the seqno to initialise the fence with * @name: the name of the new sync point * @fence: return the fd of the new sync_file with the created fence */ struct sw_sync_create_fence_data { __u32 value; char name[32]; __s32 fence; /* fd of new fence */ }; /** * struct sw_sync_get_deadline - get the deadline hint of a sw_sync fence * @deadline_ns: absolute time of the deadline * @pad: must be zero * @fence_fd: the sw_sync fence fd (in) * * Return the earliest deadline set on the fence. The timebase for the * deadline is CLOCK_MONOTONIC (same as vblank). If there is no deadline * set on the fence, this ioctl will return -ENOENT. */ struct sw_sync_get_deadline { __u64 deadline_ns; __u32 pad; __s32 fence_fd; }; #define SW_SYNC_IOC_MAGIC 'W' #define SW_SYNC_IOC_CREATE_FENCE _IOWR(SW_SYNC_IOC_MAGIC, 0,\ struct sw_sync_create_fence_data) #define SW_SYNC_IOC_INC _IOW(SW_SYNC_IOC_MAGIC, 1, __u32) #define SW_SYNC_GET_DEADLINE _IOWR(SW_SYNC_IOC_MAGIC, 2, \ struct sw_sync_get_deadline) #define SW_SYNC_HAS_DEADLINE_BIT DMA_FENCE_FLAG_USER_BITS static const struct dma_fence_ops timeline_fence_ops; static inline struct sync_pt *dma_fence_to_sync_pt(struct dma_fence *fence) { if (fence->ops != &timeline_fence_ops) return NULL; return container_of(fence, struct sync_pt, base); } /** * sync_timeline_create() - creates a sync object * @name: sync_timeline name * * Creates a new sync_timeline. Returns the sync_timeline object or NULL in * case of error. */ static struct sync_timeline *sync_timeline_create(const char *name) { struct sync_timeline *obj; obj = kzalloc(sizeof(*obj), GFP_KERNEL); if (!obj) return NULL; kref_init(&obj->kref); obj->context = dma_fence_context_alloc(1); strscpy(obj->name, name, sizeof(obj->name)); obj->pt_tree = RB_ROOT; INIT_LIST_HEAD(&obj->pt_list); spin_lock_init(&obj->lock); sync_timeline_debug_add(obj); return obj; } static void sync_timeline_free(struct kref *kref) { struct sync_timeline *obj = container_of(kref, struct sync_timeline, kref); sync_timeline_debug_remove(obj); kfree(obj); } static void sync_timeline_get(struct sync_timeline *obj) { kref_get(&obj->kref); } static void sync_timeline_put(struct sync_timeline *obj) { kref_put(&obj->kref, sync_timeline_free); } static const char *timeline_fence_get_driver_name(struct dma_fence *fence) { return "sw_sync"; } static const char *timeline_fence_get_timeline_name(struct dma_fence *fence) { struct sync_timeline *parent = dma_fence_parent(fence); return parent->name; } static void timeline_fence_release(struct dma_fence *fence) { struct sync_pt *pt = dma_fence_to_sync_pt(fence); struct sync_timeline *parent = dma_fence_parent(fence); unsigned long flags; spin_lock_irqsave(fence->lock, flags); if (!list_empty(&pt->link)) { list_del(&pt->link); rb_erase(&pt->node, &parent->pt_tree); } spin_unlock_irqrestore(fence->lock, flags); sync_timeline_put(parent); dma_fence_free(fence); } static bool timeline_fence_signaled(struct dma_fence *fence) { struct sync_timeline *parent = dma_fence_parent(fence); return !__dma_fence_is_later(fence, fence->seqno, parent->value); } static void timeline_fence_set_deadline(struct dma_fence *fence, ktime_t deadline) { struct sync_pt *pt = dma_fence_to_sync_pt(fence); unsigned long flags; spin_lock_irqsave(fence->lock, flags); if (test_bit(SW_SYNC_HAS_DEADLINE_BIT, &fence->flags)) { if (ktime_before(deadline, pt->deadline)) pt->deadline = deadline; } else { pt->deadline = deadline; __set_bit(SW_SYNC_HAS_DEADLINE_BIT, &fence->flags); } spin_unlock_irqrestore(fence->lock, flags); } static const struct dma_fence_ops timeline_fence_ops = { .get_driver_name = timeline_fence_get_driver_name, .get_timeline_name = timeline_fence_get_timeline_name, .signaled = timeline_fence_signaled, .release = timeline_fence_release, .set_deadline = timeline_fence_set_deadline, }; /** * sync_timeline_signal() - signal a status change on a sync_timeline * @obj: sync_timeline to signal * @inc: num to increment on timeline->value * * A sync implementation should call this any time one of it's fences * has signaled or has an error condition. */ static void sync_timeline_signal(struct sync_timeline *obj, unsigned int inc) { LIST_HEAD(signalled); struct sync_pt *pt, *next; trace_sync_timeline(obj); spin_lock_irq(&obj->lock); obj->value += inc; list_for_each_entry_safe(pt, next, &obj->pt_list, link) { if (!timeline_fence_signaled(&pt->base)) break; dma_fence_get(&pt->base); list_move_tail(&pt->link, &signalled); rb_erase(&pt->node, &obj->pt_tree); dma_fence_signal_locked(&pt->base); } spin_unlock_irq(&obj->lock); list_for_each_entry_safe(pt, next, &signalled, link) { list_del_init(&pt->link); dma_fence_put(&pt->base); } } /** * sync_pt_create() - creates a sync pt * @obj: parent sync_timeline * @value: value of the fence * * Creates a new sync_pt (fence) as a child of @parent. @size bytes will be * allocated allowing for implementation specific data to be kept after * the generic sync_timeline struct. Returns the sync_pt object or * NULL in case of error. */ static struct sync_pt *sync_pt_create(struct sync_timeline *obj, unsigned int value) { struct sync_pt *pt; pt = kzalloc(sizeof(*pt), GFP_KERNEL); if (!pt) return NULL; sync_timeline_get(obj); dma_fence_init(&pt->base, &timeline_fence_ops, &obj->lock, obj->context, value); INIT_LIST_HEAD(&pt->link); spin_lock_irq(&obj->lock); if (!dma_fence_is_signaled_locked(&pt->base)) { struct rb_node **p = &obj->pt_tree.rb_node; struct rb_node *parent = NULL; while (*p) { struct sync_pt *other; int cmp; parent = *p; other = rb_entry(parent, typeof(*pt), node); cmp = value - other->base.seqno; if (cmp > 0) { p = &parent->rb_right; } else if (cmp < 0) { p = &parent->rb_left; } else { if (dma_fence_get_rcu(&other->base)) { sync_timeline_put(obj); kfree(pt); pt = other; goto unlock; } p = &parent->rb_left; } } rb_link_node(&pt->node, parent, p); rb_insert_color(&pt->node, &obj->pt_tree); parent = rb_next(&pt->node); list_add_tail(&pt->link, parent ? &rb_entry(parent, typeof(*pt), node)->link : &obj->pt_list); } unlock: spin_unlock_irq(&obj->lock); return pt; } /* * *WARNING* * * improper use of this can result in deadlocking kernel drivers from userspace. */ /* opening sw_sync create a new sync obj */ static int sw_sync_debugfs_open(struct inode *inode, struct file *file) { struct sync_timeline *obj; char task_comm[TASK_COMM_LEN]; get_task_comm(task_comm, current); obj = sync_timeline_create(task_comm); if (!obj) return -ENOMEM; file->private_data = obj; return 0; } static int sw_sync_debugfs_release(struct inode *inode, struct file *file) { struct sync_timeline *obj = file->private_data; struct sync_pt *pt, *next; spin_lock_irq(&obj->lock); list_for_each_entry_safe(pt, next, &obj->pt_list, link) { dma_fence_set_error(&pt->base, -ENOENT); dma_fence_signal_locked(&pt->base); } spin_unlock_irq(&obj->lock); sync_timeline_put(obj); return 0; } static long sw_sync_ioctl_create_fence(struct sync_timeline *obj, unsigned long arg) { int fd = get_unused_fd_flags(O_CLOEXEC); int err; struct sync_pt *pt; struct sync_file *sync_file; struct sw_sync_create_fence_data data; if (fd < 0) return fd; if (copy_from_user(&data, (void __user *)arg, sizeof(data))) { err = -EFAULT; goto err; } pt = sync_pt_create(obj, data.value); if (!pt) { err = -ENOMEM; goto err; } sync_file = sync_file_create(&pt->base); dma_fence_put(&pt->base); if (!sync_file) { err = -ENOMEM; goto err; } data.fence = fd; if (copy_to_user((void __user *)arg, &data, sizeof(data))) { fput(sync_file->file); err = -EFAULT; goto err; } fd_install(fd, sync_file->file); return 0; err: put_unused_fd(fd); return err; } static long sw_sync_ioctl_inc(struct sync_timeline *obj, unsigned long arg) { u32 value; if (copy_from_user(&value, (void __user *)arg, sizeof(value))) return -EFAULT; while (value > INT_MAX) { sync_timeline_signal(obj, INT_MAX); value -= INT_MAX; } sync_timeline_signal(obj, value); return 0; } static int sw_sync_ioctl_get_deadline(struct sync_timeline *obj, unsigned long arg) { struct sw_sync_get_deadline data; struct dma_fence *fence; unsigned long flags; struct sync_pt *pt; int ret = 0; if (copy_from_user(&data, (void __user *)arg, sizeof(data))) return -EFAULT; if (data.deadline_ns || data.pad) return -EINVAL; fence = sync_file_get_fence(data.fence_fd); if (!fence) return -EINVAL; pt = dma_fence_to_sync_pt(fence); if (!pt) { ret = -EINVAL; goto put_fence; } spin_lock_irqsave(fence->lock, flags); if (!test_bit(SW_SYNC_HAS_DEADLINE_BIT, &fence->flags)) { ret = -ENOENT; goto unlock; } data.deadline_ns = ktime_to_ns(pt->deadline); spin_unlock_irqrestore(fence->lock, flags); dma_fence_put(fence); if (ret) return ret; if (copy_to_user((void __user *)arg, &data, sizeof(data))) return -EFAULT; return 0; unlock: spin_unlock_irqrestore(fence->lock, flags); put_fence: dma_fence_put(fence); return ret; } static long sw_sync_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct sync_timeline *obj = file->private_data; switch (cmd) { case SW_SYNC_IOC_CREATE_FENCE: return sw_sync_ioctl_create_fence(obj, arg); case SW_SYNC_IOC_INC: return sw_sync_ioctl_inc(obj, arg); case SW_SYNC_GET_DEADLINE: return sw_sync_ioctl_get_deadline(obj, arg); default: return -ENOTTY; } } const struct file_operations sw_sync_debugfs_fops = { .open = sw_sync_debugfs_open, .release = sw_sync_debugfs_release, .unlocked_ioctl = sw_sync_ioctl, .compat_ioctl = compat_ptr_ioctl, };
2 1 43 43 41 2 5 5 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 // SPDX-License-Identifier: GPL-2.0-only /* * Vxlan multicast group handling * */ #include <linux/kernel.h> #include <net/net_namespace.h> #include <net/sock.h> #include <linux/igmp.h> #include <net/vxlan.h> #include "vxlan_private.h" /* Update multicast group membership when first VNI on * multicast address is brought up */ int vxlan_igmp_join(struct vxlan_dev *vxlan, union vxlan_addr *rip, int rifindex) { union vxlan_addr *ip = (rip ? : &vxlan->default_dst.remote_ip); int ifindex = (rifindex ? : vxlan->default_dst.remote_ifindex); int ret = -EINVAL; struct sock *sk; if (ip->sa.sa_family == AF_INET) { struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, .imr_ifindex = ifindex, }; sk = sock4->sock->sk; lock_sock(sk); ret = ip_mc_join_group(sk, &mreq); release_sock(sk); #if IS_ENABLED(CONFIG_IPV6) } else { struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); sk = sock6->sock->sk; lock_sock(sk); ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex, &ip->sin6.sin6_addr); release_sock(sk); #endif } return ret; } int vxlan_igmp_leave(struct vxlan_dev *vxlan, union vxlan_addr *rip, int rifindex) { union vxlan_addr *ip = (rip ? : &vxlan->default_dst.remote_ip); int ifindex = (rifindex ? : vxlan->default_dst.remote_ifindex); int ret = -EINVAL; struct sock *sk; if (ip->sa.sa_family == AF_INET) { struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, .imr_ifindex = ifindex, }; sk = sock4->sock->sk; lock_sock(sk); ret = ip_mc_leave_group(sk, &mreq); release_sock(sk); #if IS_ENABLED(CONFIG_IPV6) } else { struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); sk = sock6->sock->sk; lock_sock(sk); ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex, &ip->sin6.sin6_addr); release_sock(sk); #endif } return ret; } static bool vxlan_group_used_match(union vxlan_addr *ip, int ifindex, union vxlan_addr *rip, int rifindex) { if (!vxlan_addr_multicast(rip)) return false; if (!vxlan_addr_equal(rip, ip)) return false; if (rifindex != ifindex) return false; return true; } static bool vxlan_group_used_by_vnifilter(struct vxlan_dev *vxlan, union vxlan_addr *ip, int ifindex) { struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp); struct vxlan_vni_node *v, *tmp; if (vxlan_group_used_match(ip, ifindex, &vxlan->default_dst.remote_ip, vxlan->default_dst.remote_ifindex)) return true; list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { if (!vxlan_addr_multicast(&v->remote_ip)) continue; if (vxlan_group_used_match(ip, ifindex, &v->remote_ip, vxlan->default_dst.remote_ifindex)) return true; } return false; } /* See if multicast group is already in use by other ID */ bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev, __be32 vni, union vxlan_addr *rip, int rifindex) { union vxlan_addr *ip = (rip ? : &dev->default_dst.remote_ip); int ifindex = (rifindex ? : dev->default_dst.remote_ifindex); struct vxlan_dev *vxlan; struct vxlan_sock *sock4; #if IS_ENABLED(CONFIG_IPV6) struct vxlan_sock *sock6; #endif unsigned short family = dev->default_dst.remote_ip.sa.sa_family; sock4 = rtnl_dereference(dev->vn4_sock); /* The vxlan_sock is only used by dev, leaving group has * no effect on other vxlan devices. */ if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1) return false; #if IS_ENABLED(CONFIG_IPV6) sock6 = rtnl_dereference(dev->vn6_sock); if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1) return false; #endif list_for_each_entry(vxlan, &vn->vxlan_list, next) { if (!netif_running(vxlan->dev) || vxlan == dev) continue; if (family == AF_INET && rtnl_dereference(vxlan->vn4_sock) != sock4) continue; #if IS_ENABLED(CONFIG_IPV6) if (family == AF_INET6 && rtnl_dereference(vxlan->vn6_sock) != sock6) continue; #endif if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { if (!vxlan_group_used_by_vnifilter(vxlan, ip, ifindex)) continue; } else { if (!vxlan_group_used_match(ip, ifindex, &vxlan->default_dst.remote_ip, vxlan->default_dst.remote_ifindex)) continue; } return true; } return false; } static int vxlan_multicast_join_vnigrp(struct vxlan_dev *vxlan) { struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp); struct vxlan_vni_node *v, *tmp, *vgood = NULL; int ret = 0; list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { if (!vxlan_addr_multicast(&v->remote_ip)) continue; /* skip if address is same as default address */ if (vxlan_addr_equal(&v->remote_ip, &vxlan->default_dst.remote_ip)) continue; ret = vxlan_igmp_join(vxlan, &v->remote_ip, 0); if (ret == -EADDRINUSE) ret = 0; if (ret) goto out; vgood = v; } out: if (ret) { list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { if (!vxlan_addr_multicast(&v->remote_ip)) continue; if (vxlan_addr_equal(&v->remote_ip, &vxlan->default_dst.remote_ip)) continue; vxlan_igmp_leave(vxlan, &v->remote_ip, 0); if (v == vgood) break; } } return ret; } static int vxlan_multicast_leave_vnigrp(struct vxlan_dev *vxlan) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp); struct vxlan_vni_node *v, *tmp; int last_err = 0, ret; list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { if (vxlan_addr_multicast(&v->remote_ip) && !vxlan_group_used(vn, vxlan, v->vni, &v->remote_ip, 0)) { ret = vxlan_igmp_leave(vxlan, &v->remote_ip, 0); if (ret) last_err = ret; } } return last_err; } int vxlan_multicast_join(struct vxlan_dev *vxlan) { int ret = 0; if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) { ret = vxlan_igmp_join(vxlan, &vxlan->default_dst.remote_ip, vxlan->default_dst.remote_ifindex); if (ret == -EADDRINUSE) ret = 0; if (ret) return ret; } if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) return vxlan_multicast_join_vnigrp(vxlan); return 0; } int vxlan_multicast_leave(struct vxlan_dev *vxlan) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); int ret = 0; if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && !vxlan_group_used(vn, vxlan, 0, NULL, 0)) { ret = vxlan_igmp_leave(vxlan, &vxlan->default_dst.remote_ip, vxlan->default_dst.remote_ifindex); if (ret) return ret; } if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) return vxlan_multicast_leave_vnigrp(vxlan); return 0; }
8 1 8 8 8 8 8 8 8 8 5 8 8 8 5 8 8 8 5 5 5 5 1 1 5 5 9 2 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/kallsyms.h> #include <linux/gfs2_ondisk.h> #include "gfs2.h" #include "incore.h" #include "glock.h" #include "inode.h" #include "log.h" #include "lops.h" #include "meta_io.h" #include "trans.h" #include "util.h" #include "trace_gfs2.h" static void gfs2_print_trans(struct gfs2_sbd *sdp, const struct gfs2_trans *tr) { fs_warn(sdp, "Transaction created at: %pSR\n", (void *)tr->tr_ip); fs_warn(sdp, "blocks=%u revokes=%u reserved=%u touched=%u\n", tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, test_bit(TR_TOUCHED, &tr->tr_flags)); fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u\n", tr->tr_num_buf_new, tr->tr_num_buf_rm, tr->tr_num_databuf_new, tr->tr_num_databuf_rm, tr->tr_num_revoke); } int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp, unsigned int blocks, unsigned int revokes, unsigned long ip) { unsigned int extra_revokes; if (current->journal_info) { gfs2_print_trans(sdp, current->journal_info); BUG(); } BUG_ON(blocks == 0 && revokes == 0); if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) return -EROFS; tr->tr_ip = ip; tr->tr_blocks = blocks; tr->tr_revokes = revokes; tr->tr_reserved = GFS2_LOG_FLUSH_MIN_BLOCKS; if (blocks) { /* * The reserved blocks are either used for data or metadata. * We can have mixed data and metadata, each with its own log * descriptor block; see calc_reserved(). */ tr->tr_reserved += blocks + 1 + DIV_ROUND_UP(blocks - 1, databuf_limit(sdp)); } INIT_LIST_HEAD(&tr->tr_databuf); INIT_LIST_HEAD(&tr->tr_buf); INIT_LIST_HEAD(&tr->tr_list); INIT_LIST_HEAD(&tr->tr_ail1_list); INIT_LIST_HEAD(&tr->tr_ail2_list); if (gfs2_assert_warn(sdp, tr->tr_reserved <= sdp->sd_jdesc->jd_blocks)) return -EINVAL; sb_start_intwrite(sdp->sd_vfs); /* * Try the reservations under sd_log_flush_lock to prevent log flushes * from creating inconsistencies between the number of allocated and * reserved revokes. If that fails, do a full-block allocation outside * of the lock to avoid stalling log flushes. Then, allot the * appropriate number of blocks to revokes, use as many revokes locally * as needed, and "release" the surplus into the revokes pool. */ down_read(&sdp->sd_log_flush_lock); if (gfs2_log_try_reserve(sdp, tr, &extra_revokes)) goto reserved; up_read(&sdp->sd_log_flush_lock); gfs2_log_reserve(sdp, tr, &extra_revokes); down_read(&sdp->sd_log_flush_lock); reserved: gfs2_log_release_revokes(sdp, extra_revokes); if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { gfs2_log_release_revokes(sdp, tr->tr_revokes); up_read(&sdp->sd_log_flush_lock); gfs2_log_release(sdp, tr->tr_reserved); sb_end_intwrite(sdp->sd_vfs); return -EROFS; } current->journal_info = tr; return 0; } int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, unsigned int revokes) { struct gfs2_trans *tr; int error; tr = kmem_cache_zalloc(gfs2_trans_cachep, GFP_NOFS); if (!tr) return -ENOMEM; error = __gfs2_trans_begin(tr, sdp, blocks, revokes, _RET_IP_); if (error) kmem_cache_free(gfs2_trans_cachep, tr); return error; } void gfs2_trans_end(struct gfs2_sbd *sdp) { struct gfs2_trans *tr = current->journal_info; s64 nbuf; current->journal_info = NULL; if (!test_bit(TR_TOUCHED, &tr->tr_flags)) { gfs2_log_release_revokes(sdp, tr->tr_revokes); up_read(&sdp->sd_log_flush_lock); gfs2_log_release(sdp, tr->tr_reserved); if (!test_bit(TR_ONSTACK, &tr->tr_flags)) gfs2_trans_free(sdp, tr); sb_end_intwrite(sdp->sd_vfs); return; } gfs2_log_release_revokes(sdp, tr->tr_revokes - tr->tr_num_revoke); nbuf = tr->tr_num_buf_new + tr->tr_num_databuf_new; nbuf -= tr->tr_num_buf_rm; nbuf -= tr->tr_num_databuf_rm; if (gfs2_assert_withdraw(sdp, nbuf <= tr->tr_blocks) || gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) gfs2_print_trans(sdp, tr); gfs2_log_commit(sdp, tr); if (!test_bit(TR_ONSTACK, &tr->tr_flags) && !test_bit(TR_ATTACHED, &tr->tr_flags)) gfs2_trans_free(sdp, tr); up_read(&sdp->sd_log_flush_lock); if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS) gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_TRANS_END); sb_end_intwrite(sdp->sd_vfs); } static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, struct buffer_head *bh) { struct gfs2_bufdata *bd; bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); bd->bd_bh = bh; bd->bd_gl = gl; INIT_LIST_HEAD(&bd->bd_list); INIT_LIST_HEAD(&bd->bd_ail_st_list); INIT_LIST_HEAD(&bd->bd_ail_gl_list); bh->b_private = bd; return bd; } /** * gfs2_trans_add_data - Add a databuf to the transaction. * @gl: The inode glock associated with the buffer * @bh: The buffer to add * * This is used in journaled data mode. * We need to journal the data block in the same way as metadata in * the functions above. The difference is that here we have a tag * which is two __be64's being the block number (as per meta data) * and a flag which says whether the data block needs escaping or * not. This means we need a new log entry for each 251 or so data * blocks, which isn't an enormous overhead but twice as much as * for normal metadata blocks. */ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) { struct gfs2_trans *tr = current->journal_info; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_bufdata *bd; lock_buffer(bh); if (buffer_pinned(bh)) { set_bit(TR_TOUCHED, &tr->tr_flags); goto out; } gfs2_log_lock(sdp); bd = bh->b_private; if (bd == NULL) { gfs2_log_unlock(sdp); unlock_buffer(bh); if (bh->b_private == NULL) bd = gfs2_alloc_bufdata(gl, bh); else bd = bh->b_private; lock_buffer(bh); gfs2_log_lock(sdp); } gfs2_assert(sdp, bd->bd_gl == gl); set_bit(TR_TOUCHED, &tr->tr_flags); if (list_empty(&bd->bd_list)) { set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); gfs2_pin(sdp, bd->bd_bh); tr->tr_num_databuf_new++; list_add_tail(&bd->bd_list, &tr->tr_databuf); } gfs2_log_unlock(sdp); out: unlock_buffer(bh); } void gfs2_trans_add_databufs(struct gfs2_glock *gl, struct folio *folio, size_t from, size_t len) { struct buffer_head *head = folio_buffers(folio); unsigned int bsize = head->b_size; struct buffer_head *bh; size_t to = from + len; size_t start, end; for (bh = head, start = 0; bh != head || !start; bh = bh->b_this_page, start = end) { end = start + bsize; if (end <= from) continue; if (start >= to) break; set_buffer_uptodate(bh); gfs2_trans_add_data(gl, bh); } } void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct super_block *sb = sdp->sd_vfs; struct gfs2_bufdata *bd; struct gfs2_meta_header *mh; struct gfs2_trans *tr = current->journal_info; bool withdraw = false; lock_buffer(bh); if (buffer_pinned(bh)) { set_bit(TR_TOUCHED, &tr->tr_flags); goto out; } gfs2_log_lock(sdp); bd = bh->b_private; if (bd == NULL) { gfs2_log_unlock(sdp); unlock_buffer(bh); folio_lock(bh->b_folio); if (bh->b_private == NULL) bd = gfs2_alloc_bufdata(gl, bh); else bd = bh->b_private; folio_unlock(bh->b_folio); lock_buffer(bh); gfs2_log_lock(sdp); } gfs2_assert(sdp, bd->bd_gl == gl); set_bit(TR_TOUCHED, &tr->tr_flags); if (!list_empty(&bd->bd_list)) goto out_unlock; set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { fs_err(sdp, "Attempting to add uninitialised block to " "journal (inplace block=%lld)\n", (unsigned long long)bd->bd_bh->b_blocknr); BUG(); } if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_info(sdp, "GFS2:adding buf while withdrawn! 0x%llx\n", (unsigned long long)bd->bd_bh->b_blocknr); goto out_unlock; } if (unlikely(sb->s_writers.frozen == SB_FREEZE_COMPLETE)) { fs_info(sdp, "GFS2:adding buf while frozen\n"); withdraw = true; goto out_unlock; } gfs2_pin(sdp, bd->bd_bh); mh->__pad0 = cpu_to_be64(0); mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); list_add(&bd->bd_list, &tr->tr_buf); tr->tr_num_buf_new++; out_unlock: gfs2_log_unlock(sdp); if (withdraw) gfs2_assert_withdraw(sdp, 0); out: unlock_buffer(bh); } void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { struct gfs2_trans *tr = current->journal_info; BUG_ON(!list_empty(&bd->bd_list)); gfs2_add_revoke(sdp, bd); set_bit(TR_TOUCHED, &tr->tr_flags); tr->tr_num_revoke++; } void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) { struct gfs2_bufdata *bd, *tmp; unsigned int n = len; gfs2_log_lock(sdp); list_for_each_entry_safe(bd, tmp, &sdp->sd_log_revokes, bd_list) { if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) { list_del_init(&bd->bd_list); gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); sdp->sd_log_num_revoke--; if (bd->bd_gl) gfs2_glock_remove_revoke(bd->bd_gl); kmem_cache_free(gfs2_bufdata_cachep, bd); gfs2_log_release_revokes(sdp, 1); if (--n == 0) break; } } gfs2_log_unlock(sdp); } void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { if (tr == NULL) return; gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); gfs2_assert_warn(sdp, list_empty(&tr->tr_databuf)); gfs2_assert_warn(sdp, list_empty(&tr->tr_buf)); kmem_cache_free(gfs2_trans_cachep, tr); }
42 6406 1 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_ERR_H #define _LINUX_ERR_H #include <linux/compiler.h> #include <linux/types.h> #include <asm/errno.h> /* * Kernel pointers have redundant information, so we can use a * scheme where we can return either an error code or a normal * pointer with the same return value. * * This should be a per-architecture thing, to allow different * error and pointer decisions. */ #define MAX_ERRNO 4095 #ifndef __ASSEMBLY__ /** * IS_ERR_VALUE - Detect an error pointer. * @x: The pointer to check. * * Like IS_ERR(), but does not generate a compiler warning if result is unused. */ #define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO) /** * ERR_PTR - Create an error pointer. * @error: A negative error code. * * Encodes @error into a pointer value. Users should consider the result * opaque and not assume anything about how the error is encoded. * * Return: A pointer with @error encoded within its value. */ static inline void * __must_check ERR_PTR(long error) { return (void *) error; } /* Return the pointer in the percpu address space. */ #define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error)) /* Cast an error pointer to __iomem. */ #define IOMEM_ERR_PTR(error) (__force void __iomem *)ERR_PTR(error) /** * PTR_ERR - Extract the error code from an error pointer. * @ptr: An error pointer. * Return: The error code within @ptr. */ static inline long __must_check PTR_ERR(__force const void *ptr) { return (long) ptr; } /* Read an error pointer from the percpu address space. */ #define PTR_ERR_PCPU(ptr) (PTR_ERR((const void *)(__force const unsigned long)(ptr))) /** * IS_ERR - Detect an error pointer. * @ptr: The pointer to check. * Return: true if @ptr is an error pointer, false otherwise. */ static inline bool __must_check IS_ERR(__force const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); } /* Read an error pointer from the percpu address space. */ #define IS_ERR_PCPU(ptr) (IS_ERR((const void *)(__force const unsigned long)(ptr))) /** * IS_ERR_OR_NULL - Detect an error pointer or a null pointer. * @ptr: The pointer to check. * * Like IS_ERR(), but also returns true for a null pointer. */ static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) { return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr); } /** * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type * @ptr: The pointer to cast. * * Explicitly cast an error-valued pointer to another pointer type in such a * way as to make it clear that's what's going on. */ static inline void * __must_check ERR_CAST(__force const void *ptr) { /* cast away the const */ return (void *) ptr; } /** * PTR_ERR_OR_ZERO - Extract the error code from a pointer if it has one. * @ptr: A potential error pointer. * * Convenience function that can be used inside a function that returns * an error code to propagate errors received as error pointers. * For example, ``return PTR_ERR_OR_ZERO(ptr);`` replaces: * * .. code-block:: c * * if (IS_ERR(ptr)) * return PTR_ERR(ptr); * else * return 0; * * Return: The error code within @ptr if it is an error pointer; 0 otherwise. */ static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) { if (IS_ERR(ptr)) return PTR_ERR(ptr); else return 0; } #endif #endif /* _LINUX_ERR_H */
1 1 1 1 1 1 1 1 1 1 4 4 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/rculist.h> #include <linux/llist.h> #include "rds_single_path.h" #include "ib_mr.h" #include "rds.h" struct workqueue_struct *rds_ib_mr_wq; static void rds_ib_odp_mr_worker(struct work_struct *work); static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) { struct rds_ib_device *rds_ibdev; struct rds_ib_ipaddr *i_ipaddr; rcu_read_lock(); list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { refcount_inc(&rds_ibdev->refcount); rcu_read_unlock(); return rds_ibdev; } } } rcu_read_unlock(); return NULL; } static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { struct rds_ib_ipaddr *i_ipaddr; i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL); if (!i_ipaddr) return -ENOMEM; i_ipaddr->ipaddr = ipaddr; spin_lock_irq(&rds_ibdev->spinlock); list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list); spin_unlock_irq(&rds_ibdev->spinlock); return 0; } static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { struct rds_ib_ipaddr *i_ipaddr; struct rds_ib_ipaddr *to_free = NULL; spin_lock_irq(&rds_ibdev->spinlock); list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { list_del_rcu(&i_ipaddr->list); to_free = i_ipaddr; break; } } spin_unlock_irq(&rds_ibdev->spinlock); if (to_free) kfree_rcu(to_free, rcu); } int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, struct in6_addr *ipaddr) { struct rds_ib_device *rds_ibdev_old; rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]); if (!rds_ibdev_old) return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); if (rds_ibdev_old != rds_ibdev) { rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]); rds_ib_dev_put(rds_ibdev_old); return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); } rds_ib_dev_put(rds_ibdev_old); return 0; } void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; /* conn was previously on the nodev_conns_list */ spin_lock_irq(&ib_nodev_conns_lock); BUG_ON(list_empty(&ib_nodev_conns)); BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); spin_lock(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); spin_unlock(&rds_ibdev->spinlock); spin_unlock_irq(&ib_nodev_conns_lock); ic->rds_ibdev = rds_ibdev; refcount_inc(&rds_ibdev->refcount); } void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; /* place conn on nodev_conns_list */ spin_lock(&ib_nodev_conns_lock); spin_lock_irq(&rds_ibdev->spinlock); BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); spin_unlock_irq(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &ib_nodev_conns); spin_unlock(&ib_nodev_conns_lock); ic->rds_ibdev = NULL; rds_ib_dev_put(rds_ibdev); } void rds_ib_destroy_nodev_conns(void) { struct rds_ib_connection *ic, *_ic; LIST_HEAD(tmp_list); /* avoid calling conn_destroy with irqs off */ spin_lock_irq(&ib_nodev_conns_lock); list_splice(&ib_nodev_conns, &tmp_list); spin_unlock_irq(&ib_nodev_conns_lock); list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) rds_conn_destroy(ic->conn); } void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) { struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; iinfo->rdma_mr_max = pool_1m->max_items; iinfo->rdma_mr_size = pool_1m->max_pages; } #if IS_ENABLED(CONFIG_IPV6) void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds6_info_rdma_connection *iinfo6) { struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; iinfo6->rdma_mr_max = pool_1m->max_items; iinfo6->rdma_mr_size = pool_1m->max_pages; } #endif struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; struct llist_node *ret; unsigned long flags; spin_lock_irqsave(&pool->clean_lock, flags); ret = llist_del_first(&pool->clean_list); spin_unlock_irqrestore(&pool->clean_lock, flags); if (ret) { ibmr = llist_entry(ret, struct rds_ib_mr, llnode); if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_reused); else rds_ib_stats_inc(s_ib_rdma_mr_1m_reused); } return ibmr; } void rds_ib_sync_mr(void *trans_private, int direction) { struct rds_ib_mr *ibmr = trans_private; struct rds_ib_device *rds_ibdev = ibmr->device; if (ibmr->odp) return; switch (direction) { case DMA_FROM_DEVICE: ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, ibmr->sg_dma_len, DMA_BIDIRECTIONAL); break; case DMA_TO_DEVICE: ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg, ibmr->sg_dma_len, DMA_BIDIRECTIONAL); break; } } void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { struct rds_ib_device *rds_ibdev = ibmr->device; if (ibmr->sg_dma_len) { ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len, DMA_BIDIRECTIONAL); ibmr->sg_dma_len = 0; } /* Release the s/g list */ if (ibmr->sg_len) { unsigned int i; for (i = 0; i < ibmr->sg_len; ++i) { struct page *page = sg_page(&ibmr->sg[i]); /* FIXME we need a way to tell a r/w MR * from a r/o MR */ WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); put_page(page); } kfree(ibmr->sg); ibmr->sg = NULL; ibmr->sg_len = 0; } } void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { unsigned int pinned = ibmr->sg_len; __rds_ib_teardown_mr(ibmr); if (pinned) { struct rds_ib_mr_pool *pool = ibmr->pool; atomic_sub(pinned, &pool->free_pinned); } } static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all) { unsigned int item_count; item_count = atomic_read(&pool->item_count); if (free_all) return item_count; return 0; } /* * given an llist of mrs, put them all into the list_head for more processing */ static unsigned int llist_append_to_list(struct llist_head *llist, struct list_head *list) { struct rds_ib_mr *ibmr; struct llist_node *node; struct llist_node *next; unsigned int count = 0; node = llist_del_all(llist); while (node) { next = node->next; ibmr = llist_entry(node, struct rds_ib_mr, llnode); list_add_tail(&ibmr->unmap_list, list); node = next; count++; } return count; } /* * this takes a list head of mrs and turns it into linked llist nodes * of clusters. Each cluster has linked llist nodes of * MR_CLUSTER_SIZE mrs that are ready for reuse. */ static void list_to_llist_nodes(struct list_head *list, struct llist_node **nodes_head, struct llist_node **nodes_tail) { struct rds_ib_mr *ibmr; struct llist_node *cur = NULL; struct llist_node **next = nodes_head; list_for_each_entry(ibmr, list, unmap_list) { cur = &ibmr->llnode; *next = cur; next = &cur->next; } *next = NULL; *nodes_tail = cur; } /* * Flush our pool of MRs. * At a minimum, all currently unused MRs are unmapped. * If the number of MRs allocated exceeds the limit, we also try * to free as many MRs as needed to get back to this limit. */ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **ibmr_ret) { struct rds_ib_mr *ibmr; struct llist_node *clean_nodes; struct llist_node *clean_tail; LIST_HEAD(unmap_list); unsigned long unpinned = 0; unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush); if (ibmr_ret) { DEFINE_WAIT(wait); while (!mutex_trylock(&pool->flush_lock)) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); goto out_nolock; } prepare_to_wait(&pool->flush_wait, &wait, TASK_UNINTERRUPTIBLE); if (llist_empty(&pool->clean_list)) schedule(); ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); goto out_nolock; } } finish_wait(&pool->flush_wait, &wait); } else mutex_lock(&pool->flush_lock); if (ibmr_ret) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; goto out; } } /* Get the list of all MRs to be dropped. Ordering matters - * we want to put drop_list ahead of free_list. */ dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list); dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list); if (free_all) { unsigned long flags; spin_lock_irqsave(&pool->clean_lock, flags); llist_append_to_list(&pool->clean_list, &unmap_list); spin_unlock_irqrestore(&pool->clean_lock, flags); } free_goal = rds_ib_flush_goal(pool, free_all); if (list_empty(&unmap_list)) goto out; rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal); if (!list_empty(&unmap_list)) { unsigned long flags; list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail); if (ibmr_ret) { *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode); clean_nodes = clean_nodes->next; } /* more than one entry in llist nodes */ if (clean_nodes) { spin_lock_irqsave(&pool->clean_lock, flags); llist_add_batch(clean_nodes, clean_tail, &pool->clean_list); spin_unlock_irqrestore(&pool->clean_lock, flags); } } atomic_sub(unpinned, &pool->free_pinned); atomic_sub(dirty_to_clean, &pool->dirty_count); atomic_sub(nfreed, &pool->item_count); out: mutex_unlock(&pool->flush_lock); if (waitqueue_active(&pool->flush_wait)) wake_up(&pool->flush_wait); out_nolock: return 0; } struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; int iter = 0; while (1) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) return ibmr; if (atomic_inc_return(&pool->item_count) <= pool->max_items) break; atomic_dec(&pool->item_count); if (++iter > 2) { if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); break; } /* We do have some empty MRs. Flush them out. */ if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); rds_ib_flush_mr_pool(pool, 0, &ibmr); if (ibmr) return ibmr; } return NULL; } static void rds_ib_mr_pool_flush_worker(struct work_struct *work) { struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); rds_ib_flush_mr_pool(pool, 0, NULL); } void rds_ib_free_mr(void *trans_private, int invalidate) { struct rds_ib_mr *ibmr = trans_private; struct rds_ib_mr_pool *pool = ibmr->pool; struct rds_ib_device *rds_ibdev = ibmr->device; rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); if (ibmr->odp) { /* A MR created and marked as use_once. We use delayed work, * because there is a change that we are in interrupt and can't * call to ib_dereg_mr() directly. */ INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker); queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0); return; } /* Return it to the pool's free list */ rds_ib_free_frmr_list(ibmr); atomic_add(ibmr->sg_len, &pool->free_pinned); atomic_inc(&pool->dirty_count); /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || atomic_read(&pool->dirty_count) >= pool->max_items / 5) queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { rds_ib_flush_mr_pool(pool, 0, NULL); } else { /* We get here if the user created a MR marked * as use_once and invalidate at the same time. */ queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); } } rds_ib_dev_put(rds_ibdev); } void rds_ib_flush_mrs(void) { struct rds_ib_device *rds_ibdev; down_read(&rds_ib_devices_lock); list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { if (rds_ibdev->mr_8k_pool) rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL); if (rds_ibdev->mr_1m_pool) rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL); } up_read(&rds_ib_devices_lock); } u32 rds_ib_get_lkey(void *trans_private) { struct rds_ib_mr *ibmr = trans_private; return ibmr->u.mr->lkey; } void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret, struct rds_connection *conn, u64 start, u64 length, int need_odp) { struct rds_ib_device *rds_ibdev; struct rds_ib_mr *ibmr = NULL; struct rds_ib_connection *ic = NULL; int ret; rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]); if (!rds_ibdev) { ret = -ENODEV; goto out; } if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) { u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start; int access_flags = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_ON_DEMAND); struct ib_sge sge = {}; struct ib_mr *ib_mr; if (!rds_ibdev->odp_capable) { ret = -EOPNOTSUPP; goto out; } ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr, access_flags); if (IS_ERR(ib_mr)) { rdsdebug("rds_ib_get_user_mr returned %d\n", IS_ERR(ib_mr)); ret = PTR_ERR(ib_mr); goto out; } if (key_ret) *key_ret = ib_mr->rkey; ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); if (!ibmr) { ib_dereg_mr(ib_mr); ret = -ENOMEM; goto out; } ibmr->u.mr = ib_mr; ibmr->odp = 1; sge.addr = virt_addr; sge.length = length; sge.lkey = ib_mr->lkey; ib_advise_mr(rds_ibdev->pd, IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE, IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1); return ibmr; } if (conn) ic = conn->c_transport_data; if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) { ret = -ENODEV; goto out; } ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); if (IS_ERR(ibmr)) { ret = PTR_ERR(ibmr); pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret); } else { return ibmr; } out: if (rds_ibdev) rds_ib_dev_put(rds_ibdev); return ERR_PTR(ret); } void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) { cancel_delayed_work_sync(&pool->flush_worker); rds_ib_flush_mr_pool(pool, 1, NULL); WARN_ON(atomic_read(&pool->item_count)); WARN_ON(atomic_read(&pool->free_pinned)); kfree(pool); } struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, int pool_type) { struct rds_ib_mr_pool *pool; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return ERR_PTR(-ENOMEM); pool->pool_type = pool_type; init_llist_head(&pool->free_list); init_llist_head(&pool->drop_list); init_llist_head(&pool->clean_list); spin_lock_init(&pool->clean_lock); mutex_init(&pool->flush_lock); init_waitqueue_head(&pool->flush_wait); INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); if (pool_type == RDS_IB_MR_1M_POOL) { /* +1 allows for unaligned MRs */ pool->max_pages = RDS_MR_1M_MSG_SIZE + 1; pool->max_items = rds_ibdev->max_1m_mrs; } else { /* pool_type == RDS_IB_MR_8K_POOL */ pool->max_pages = RDS_MR_8K_MSG_SIZE + 1; pool->max_items = rds_ibdev->max_8k_mrs; } pool->max_free_pinned = pool->max_items * pool->max_pages / 4; pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4; return pool; } int rds_ib_mr_init(void) { rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!rds_ib_mr_wq) return -ENOMEM; return 0; } /* By the time this is called all the IB devices should have been torn down and * had their pools freed. As each pool is freed its work struct is waited on, * so the pool flushing work queue should be idle by the time we get here. */ void rds_ib_mr_exit(void) { destroy_workqueue(rds_ib_mr_wq); } static void rds_ib_odp_mr_worker(struct work_struct *work) { struct rds_ib_mr *ibmr; ibmr = container_of(work, struct rds_ib_mr, work.work); ib_dereg_mr(ibmr->u.mr); kfree(ibmr); }
5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 // SPDX-License-Identifier: GPL-2.0-or-later /* * Force feedback support for Logitech Gaming Wheels * * Including G27, G25, DFP, DFGT, FFEX, Momo, Momo2 & * Speed Force Wireless (WiiWheel) * * Copyright (c) 2010 Simon Wood <simon@mungewell.org> */ /* */ #include <linux/input.h> #include <linux/usb.h> #include <linux/hid.h> #include "usbhid/usbhid.h" #include "hid-lg.h" #include "hid-lg4ff.h" #include "hid-ids.h" #define LG4FF_MMODE_IS_MULTIMODE 0 #define LG4FF_MMODE_SWITCHED 1 #define LG4FF_MMODE_NOT_MULTIMODE 2 #define LG4FF_MODE_NATIVE_IDX 0 #define LG4FF_MODE_DFEX_IDX 1 #define LG4FF_MODE_DFP_IDX 2 #define LG4FF_MODE_G25_IDX 3 #define LG4FF_MODE_DFGT_IDX 4 #define LG4FF_MODE_G27_IDX 5 #define LG4FF_MODE_G29_IDX 6 #define LG4FF_MODE_MAX_IDX 7 #define LG4FF_MODE_NATIVE BIT(LG4FF_MODE_NATIVE_IDX) #define LG4FF_MODE_DFEX BIT(LG4FF_MODE_DFEX_IDX) #define LG4FF_MODE_DFP BIT(LG4FF_MODE_DFP_IDX) #define LG4FF_MODE_G25 BIT(LG4FF_MODE_G25_IDX) #define LG4FF_MODE_DFGT BIT(LG4FF_MODE_DFGT_IDX) #define LG4FF_MODE_G27 BIT(LG4FF_MODE_G27_IDX) #define LG4FF_MODE_G29 BIT(LG4FF_MODE_G29_IDX) #define LG4FF_DFEX_TAG "DF-EX" #define LG4FF_DFEX_NAME "Driving Force / Formula EX" #define LG4FF_DFP_TAG "DFP" #define LG4FF_DFP_NAME "Driving Force Pro" #define LG4FF_G25_TAG "G25" #define LG4FF_G25_NAME "G25 Racing Wheel" #define LG4FF_G27_TAG "G27" #define LG4FF_G27_NAME "G27 Racing Wheel" #define LG4FF_G29_TAG "G29" #define LG4FF_G29_NAME "G29 Racing Wheel" #define LG4FF_DFGT_TAG "DFGT" #define LG4FF_DFGT_NAME "Driving Force GT" #define LG4FF_FFEX_REV_MAJ 0x21 #define LG4FF_FFEX_REV_MIN 0x00 static void lg4ff_set_range_dfp(struct hid_device *hid, u16 range); static void lg4ff_set_range_g25(struct hid_device *hid, u16 range); struct lg4ff_wheel_data { const u32 product_id; u16 combine; u16 range; const u16 min_range; const u16 max_range; #ifdef CONFIG_LEDS_CLASS u8 led_state; struct led_classdev *led[5]; #endif const u32 alternate_modes; const char * const real_tag; const char * const real_name; const u16 real_product_id; void (*set_range)(struct hid_device *hid, u16 range); }; struct lg4ff_device_entry { spinlock_t report_lock; /* Protect output HID report */ struct hid_report *report; struct lg4ff_wheel_data wdata; }; static const signed short lg4ff_wheel_effects[] = { FF_CONSTANT, FF_AUTOCENTER, -1 }; static const signed short no_wheel_effects[] = { -1 }; struct lg4ff_wheel { const u32 product_id; const signed short *ff_effects; const u16 min_range; const u16 max_range; void (*set_range)(struct hid_device *hid, u16 range); }; struct lg4ff_compat_mode_switch { const u8 cmd_count; /* Number of commands to send */ const u8 cmd[]; }; struct lg4ff_wheel_ident_info { const u32 modes; const u16 mask; const u16 result; const u16 real_product_id; }; struct lg4ff_multimode_wheel { const u16 product_id; const u32 alternate_modes; const char *real_tag; const char *real_name; }; struct lg4ff_alternate_mode { const u16 product_id; const char *tag; const char *name; }; static const struct lg4ff_wheel lg4ff_devices[] = { {USB_DEVICE_ID_LOGITECH_WINGMAN_FG, no_wheel_effects, 40, 180, NULL}, {USB_DEVICE_ID_LOGITECH_WINGMAN_FFG, lg4ff_wheel_effects, 40, 180, NULL}, {USB_DEVICE_ID_LOGITECH_WHEEL, lg4ff_wheel_effects, 40, 270, NULL}, {USB_DEVICE_ID_LOGITECH_MOMO_WHEEL, lg4ff_wheel_effects, 40, 270, NULL}, {USB_DEVICE_ID_LOGITECH_DFP_WHEEL, lg4ff_wheel_effects, 40, 900, lg4ff_set_range_dfp}, {USB_DEVICE_ID_LOGITECH_G25_WHEEL, lg4ff_wheel_effects, 40, 900, lg4ff_set_range_g25}, {USB_DEVICE_ID_LOGITECH_DFGT_WHEEL, lg4ff_wheel_effects, 40, 900, lg4ff_set_range_g25}, {USB_DEVICE_ID_LOGITECH_G27_WHEEL, lg4ff_wheel_effects, 40, 900, lg4ff_set_range_g25}, {USB_DEVICE_ID_LOGITECH_G29_WHEEL, lg4ff_wheel_effects, 40, 900, lg4ff_set_range_g25}, {USB_DEVICE_ID_LOGITECH_MOMO_WHEEL2, lg4ff_wheel_effects, 40, 270, NULL}, {USB_DEVICE_ID_LOGITECH_WII_WHEEL, lg4ff_wheel_effects, 40, 270, NULL} }; static const struct lg4ff_multimode_wheel lg4ff_multimode_wheels[] = { {USB_DEVICE_ID_LOGITECH_DFP_WHEEL, LG4FF_MODE_NATIVE | LG4FF_MODE_DFP | LG4FF_MODE_DFEX, LG4FF_DFP_TAG, LG4FF_DFP_NAME}, {USB_DEVICE_ID_LOGITECH_G25_WHEEL, LG4FF_MODE_NATIVE | LG4FF_MODE_G25 | LG4FF_MODE_DFP | LG4FF_MODE_DFEX, LG4FF_G25_TAG, LG4FF_G25_NAME}, {USB_DEVICE_ID_LOGITECH_DFGT_WHEEL, LG4FF_MODE_NATIVE | LG4FF_MODE_DFGT | LG4FF_MODE_DFP | LG4FF_MODE_DFEX, LG4FF_DFGT_TAG, LG4FF_DFGT_NAME}, {USB_DEVICE_ID_LOGITECH_G27_WHEEL, LG4FF_MODE_NATIVE | LG4FF_MODE_G27 | LG4FF_MODE_G25 | LG4FF_MODE_DFP | LG4FF_MODE_DFEX, LG4FF_G27_TAG, LG4FF_G27_NAME}, {USB_DEVICE_ID_LOGITECH_G29_WHEEL, LG4FF_MODE_NATIVE | LG4FF_MODE_G29 | LG4FF_MODE_G27 | LG4FF_MODE_G25 | LG4FF_MODE_DFGT | LG4FF_MODE_DFP | LG4FF_MODE_DFEX, LG4FF_G29_TAG, LG4FF_G29_NAME}, }; static const struct lg4ff_alternate_mode lg4ff_alternate_modes[] = { [LG4FF_MODE_NATIVE_IDX] = {0, "native", ""}, [LG4FF_MODE_DFEX_IDX] = {USB_DEVICE_ID_LOGITECH_WHEEL, LG4FF_DFEX_TAG, LG4FF_DFEX_NAME}, [LG4FF_MODE_DFP_IDX] = {USB_DEVICE_ID_LOGITECH_DFP_WHEEL, LG4FF_DFP_TAG, LG4FF_DFP_NAME}, [LG4FF_MODE_G25_IDX] = {USB_DEVICE_ID_LOGITECH_G25_WHEEL, LG4FF_G25_TAG, LG4FF_G25_NAME}, [LG4FF_MODE_DFGT_IDX] = {USB_DEVICE_ID_LOGITECH_DFGT_WHEEL, LG4FF_DFGT_TAG, LG4FF_DFGT_NAME}, [LG4FF_MODE_G27_IDX] = {USB_DEVICE_ID_LOGITECH_G27_WHEEL, LG4FF_G27_TAG, LG4FF_G27_NAME}, [LG4FF_MODE_G29_IDX] = {USB_DEVICE_ID_LOGITECH_G29_WHEEL, LG4FF_G29_TAG, LG4FF_G29_NAME}, }; /* Multimode wheel identificators */ static const struct lg4ff_wheel_ident_info lg4ff_dfp_ident_info = { LG4FF_MODE_DFP | LG4FF_MODE_DFEX, 0xf000, 0x1000, USB_DEVICE_ID_LOGITECH_DFP_WHEEL }; static const struct lg4ff_wheel_ident_info lg4ff_g25_ident_info = { LG4FF_MODE_G25 | LG4FF_MODE_DFP | LG4FF_MODE_DFEX, 0xff00, 0x1200, USB_DEVICE_ID_LOGITECH_G25_WHEEL }; static const struct lg4ff_wheel_ident_info lg4ff_g27_ident_info = { LG4FF_MODE_G27 | LG4FF_MODE_G25 | LG4FF_MODE_DFP | LG4FF_MODE_DFEX, 0xfff0, 0x1230, USB_DEVICE_ID_LOGITECH_G27_WHEEL }; static const struct lg4ff_wheel_ident_info lg4ff_dfgt_ident_info = { LG4FF_MODE_DFGT | LG4FF_MODE_DFP | LG4FF_MODE_DFEX, 0xff00, 0x1300, USB_DEVICE_ID_LOGITECH_DFGT_WHEEL }; static const struct lg4ff_wheel_ident_info lg4ff_g29_ident_info = { LG4FF_MODE_G29 | LG4FF_MODE_G27 | LG4FF_MODE_G25 | LG4FF_MODE_DFGT | LG4FF_MODE_DFP | LG4FF_MODE_DFEX, 0xfff8, 0x1350, USB_DEVICE_ID_LOGITECH_G29_WHEEL }; static const struct lg4ff_wheel_ident_info lg4ff_g29_ident_info2 = { LG4FF_MODE_G29 | LG4FF_MODE_G27 | LG4FF_MODE_G25 | LG4FF_MODE_DFGT | LG4FF_MODE_DFP | LG4FF_MODE_DFEX, 0xff00, 0x8900, USB_DEVICE_ID_LOGITECH_G29_WHEEL }; /* Multimode wheel identification checklists */ static const struct lg4ff_wheel_ident_info *lg4ff_main_checklist[] = { &lg4ff_g29_ident_info, &lg4ff_g29_ident_info2, &lg4ff_dfgt_ident_info, &lg4ff_g27_ident_info, &lg4ff_g25_ident_info, &lg4ff_dfp_ident_info }; /* Compatibility mode switching commands */ /* EXT_CMD9 - Understood by G27 and DFGT */ static const struct lg4ff_compat_mode_switch lg4ff_mode_switch_ext09_dfex = { 2, {0xf8, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, /* Revert mode upon USB reset */ 0xf8, 0x09, 0x00, 0x01, 0x00, 0x00, 0x00} /* Switch mode to DF-EX with detach */ }; static const struct lg4ff_compat_mode_switch lg4ff_mode_switch_ext09_dfp = { 2, {0xf8, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, /* Revert mode upon USB reset */ 0xf8, 0x09, 0x01, 0x01, 0x00, 0x00, 0x00} /* Switch mode to DFP with detach */ }; static const struct lg4ff_compat_mode_switch lg4ff_mode_switch_ext09_g25 = { 2, {0xf8, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, /* Revert mode upon USB reset */ 0xf8, 0x09, 0x02, 0x01, 0x00, 0x00, 0x00} /* Switch mode to G25 with detach */ }; static const struct lg4ff_compat_mode_switch lg4ff_mode_switch_ext09_dfgt = { 2, {0xf8, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, /* Revert mode upon USB reset */ 0xf8, 0x09, 0x03, 0x01, 0x00, 0x00, 0x00} /* Switch mode to DFGT with detach */ }; static const struct lg4ff_compat_mode_switch lg4ff_mode_switch_ext09_g27 = { 2, {0xf8, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, /* Revert mode upon USB reset */ 0xf8, 0x09, 0x04, 0x01, 0x00, 0x00, 0x00} /* Switch mode to G27 with detach */ }; static const struct lg4ff_compat_mode_switch lg4ff_mode_switch_ext09_g29 = { 2, {0xf8, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, /* Revert mode upon USB reset */ 0xf8, 0x09, 0x05, 0x01, 0x01, 0x00, 0x00} /* Switch mode to G29 with detach */ }; /* EXT_CMD1 - Understood by DFP, G25, G27 and DFGT */ static const struct lg4ff_compat_mode_switch lg4ff_mode_switch_ext01_dfp = { 1, {0xf8, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00} }; /* EXT_CMD16 - Understood by G25 and G27 */ static const struct lg4ff_compat_mode_switch lg4ff_mode_switch_ext16_g25 = { 1, {0xf8, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00} }; /* Recalculates X axis value accordingly to currently selected range */ static s32 lg4ff_adjust_dfp_x_axis(s32 value, u16 range) { u16 max_range; s32 new_value; if (range == 900) return value; else if (range == 200) return value; else if (range < 200) max_range = 200; else max_range = 900; new_value = 8192 + mult_frac(value - 8192, max_range, range); if (new_value < 0) return 0; else if (new_value > 16383) return 16383; else return new_value; } int lg4ff_adjust_input_event(struct hid_device *hid, struct hid_field *field, struct hid_usage *usage, s32 value, struct lg_drv_data *drv_data) { struct lg4ff_device_entry *entry = drv_data->device_props; s32 new_value = 0; if (!entry) { hid_err(hid, "Device properties not found"); return 0; } switch (entry->wdata.product_id) { case USB_DEVICE_ID_LOGITECH_DFP_WHEEL: switch (usage->code) { case ABS_X: new_value = lg4ff_adjust_dfp_x_axis(value, entry->wdata.range); input_event(field->hidinput->input, usage->type, usage->code, new_value); return 1; default: return 0; } default: return 0; } } int lg4ff_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *rd, int size, struct lg_drv_data *drv_data) { int offset; struct lg4ff_device_entry *entry = drv_data->device_props; if (!entry) return 0; /* adjust HID report present combined pedals data */ if (entry->wdata.combine) { switch (entry->wdata.product_id) { case USB_DEVICE_ID_LOGITECH_WHEEL: rd[5] = rd[3]; rd[6] = 0x7F; return 1; case USB_DEVICE_ID_LOGITECH_WINGMAN_FG: case USB_DEVICE_ID_LOGITECH_WINGMAN_FFG: case USB_DEVICE_ID_LOGITECH_MOMO_WHEEL: case USB_DEVICE_ID_LOGITECH_MOMO_WHEEL2: rd[4] = rd[3]; rd[5] = 0x7F; return 1; case USB_DEVICE_ID_LOGITECH_DFP_WHEEL: rd[5] = rd[4]; rd[6] = 0x7F; return 1; case USB_DEVICE_ID_LOGITECH_G25_WHEEL: case USB_DEVICE_ID_LOGITECH_G27_WHEEL: offset = 5; break; case USB_DEVICE_ID_LOGITECH_DFGT_WHEEL: case USB_DEVICE_ID_LOGITECH_G29_WHEEL: offset = 6; break; case USB_DEVICE_ID_LOGITECH_WII_WHEEL: offset = 3; break; default: return 0; } /* Compute a combined axis when wheel does not supply it */ rd[offset] = (0xFF + rd[offset] - rd[offset+1]) >> 1; rd[offset+1] = 0x7F; return 1; } return 0; } static void lg4ff_init_wheel_data(struct lg4ff_wheel_data * const wdata, const struct lg4ff_wheel *wheel, const struct lg4ff_multimode_wheel *mmode_wheel, const u16 real_product_id) { u32 alternate_modes = 0; const char *real_tag = NULL; const char *real_name = NULL; if (mmode_wheel) { alternate_modes = mmode_wheel->alternate_modes; real_tag = mmode_wheel->real_tag; real_name = mmode_wheel->real_name; } { struct lg4ff_wheel_data t_wdata = { .product_id = wheel->product_id, .real_product_id = real_product_id, .combine = 0, .min_range = wheel->min_range, .max_range = wheel->max_range, .set_range = wheel->set_range, .alternate_modes = alternate_modes, .real_tag = real_tag, .real_name = real_name }; memcpy(wdata, &t_wdata, sizeof(t_wdata)); } } static int lg4ff_play(struct input_dev *dev, void *data, struct ff_effect *effect) { struct hid_device *hid = input_get_drvdata(dev); struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; unsigned long flags; s32 *value; int x; drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Private driver data not found!\n"); return -EINVAL; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found!\n"); return -EINVAL; } value = entry->report->field[0]->value; #define CLAMP(x) do { if (x < 0) x = 0; else if (x > 0xff) x = 0xff; } while (0) switch (effect->type) { case FF_CONSTANT: x = effect->u.ramp.start_level + 0x80; /* 0x80 is no force */ CLAMP(x); spin_lock_irqsave(&entry->report_lock, flags); if (x == 0x80) { /* De-activate force in slot-1*/ value[0] = 0x13; value[1] = 0x00; value[2] = 0x00; value[3] = 0x00; value[4] = 0x00; value[5] = 0x00; value[6] = 0x00; hid_hw_request(hid, entry->report, HID_REQ_SET_REPORT); spin_unlock_irqrestore(&entry->report_lock, flags); return 0; } value[0] = 0x11; /* Slot 1 */ value[1] = 0x08; value[2] = x; value[3] = 0x80; value[4] = 0x00; value[5] = 0x00; value[6] = 0x00; hid_hw_request(hid, entry->report, HID_REQ_SET_REPORT); spin_unlock_irqrestore(&entry->report_lock, flags); break; } return 0; } /* Sends default autocentering command compatible with * all wheels except Formula Force EX */ static void lg4ff_set_autocenter_default(struct input_dev *dev, u16 magnitude) { struct hid_device *hid = input_get_drvdata(dev); s32 *value; u32 expand_a, expand_b; struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; unsigned long flags; drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Private driver data not found!\n"); return; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found!\n"); return; } value = entry->report->field[0]->value; /* De-activate Auto-Center */ spin_lock_irqsave(&entry->report_lock, flags); if (magnitude == 0) { value[0] = 0xf5; value[1] = 0x00; value[2] = 0x00; value[3] = 0x00; value[4] = 0x00; value[5] = 0x00; value[6] = 0x00; hid_hw_request(hid, entry->report, HID_REQ_SET_REPORT); spin_unlock_irqrestore(&entry->report_lock, flags); return; } if (magnitude <= 0xaaaa) { expand_a = 0x0c * magnitude; expand_b = 0x80 * magnitude; } else { expand_a = (0x0c * 0xaaaa) + 0x06 * (magnitude - 0xaaaa); expand_b = (0x80 * 0xaaaa) + 0xff * (magnitude - 0xaaaa); } /* Adjust for non-MOMO wheels */ switch (entry->wdata.product_id) { case USB_DEVICE_ID_LOGITECH_MOMO_WHEEL: case USB_DEVICE_ID_LOGITECH_MOMO_WHEEL2: break; default: expand_a = expand_a >> 1; break; } value[0] = 0xfe; value[1] = 0x0d; value[2] = expand_a / 0xaaaa; value[3] = expand_a / 0xaaaa; value[4] = expand_b / 0xaaaa; value[5] = 0x00; value[6] = 0x00; hid_hw_request(hid, entry->report, HID_REQ_SET_REPORT); /* Activate Auto-Center */ value[0] = 0x14; value[1] = 0x00; value[2] = 0x00; value[3] = 0x00; value[4] = 0x00; value[5] = 0x00; value[6] = 0x00; hid_hw_request(hid, entry->report, HID_REQ_SET_REPORT); spin_unlock_irqrestore(&entry->report_lock, flags); } /* Sends autocentering command compatible with Formula Force EX */ static void lg4ff_set_autocenter_ffex(struct input_dev *dev, u16 magnitude) { struct hid_device *hid = input_get_drvdata(dev); struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; unsigned long flags; s32 *value; magnitude = magnitude * 90 / 65535; drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Private driver data not found!\n"); return; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found!\n"); return; } value = entry->report->field[0]->value; spin_lock_irqsave(&entry->report_lock, flags); value[0] = 0xfe; value[1] = 0x03; value[2] = magnitude >> 14; value[3] = magnitude >> 14; value[4] = magnitude; value[5] = 0x00; value[6] = 0x00; hid_hw_request(hid, entry->report, HID_REQ_SET_REPORT); spin_unlock_irqrestore(&entry->report_lock, flags); } /* Sends command to set range compatible with G25/G27/Driving Force GT */ static void lg4ff_set_range_g25(struct hid_device *hid, u16 range) { struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; unsigned long flags; s32 *value; drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Private driver data not found!\n"); return; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found!\n"); return; } value = entry->report->field[0]->value; dbg_hid("G25/G27/DFGT: setting range to %u\n", range); spin_lock_irqsave(&entry->report_lock, flags); value[0] = 0xf8; value[1] = 0x81; value[2] = range & 0x00ff; value[3] = (range & 0xff00) >> 8; value[4] = 0x00; value[5] = 0x00; value[6] = 0x00; hid_hw_request(hid, entry->report, HID_REQ_SET_REPORT); spin_unlock_irqrestore(&entry->report_lock, flags); } /* Sends commands to set range compatible with Driving Force Pro wheel */ static void lg4ff_set_range_dfp(struct hid_device *hid, u16 range) { struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; unsigned long flags; int start_left, start_right, full_range; s32 *value; drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Private driver data not found!\n"); return; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found!\n"); return; } value = entry->report->field[0]->value; dbg_hid("Driving Force Pro: setting range to %u\n", range); /* Prepare "coarse" limit command */ spin_lock_irqsave(&entry->report_lock, flags); value[0] = 0xf8; value[1] = 0x00; /* Set later */ value[2] = 0x00; value[3] = 0x00; value[4] = 0x00; value[5] = 0x00; value[6] = 0x00; if (range > 200) { value[1] = 0x03; full_range = 900; } else { value[1] = 0x02; full_range = 200; } hid_hw_request(hid, entry->report, HID_REQ_SET_REPORT); /* Prepare "fine" limit command */ value[0] = 0x81; value[1] = 0x0b; value[2] = 0x00; value[3] = 0x00; value[4] = 0x00; value[5] = 0x00; value[6] = 0x00; if (range == 200 || range == 900) { /* Do not apply any fine limit */ hid_hw_request(hid, entry->report, HID_REQ_SET_REPORT); spin_unlock_irqrestore(&entry->report_lock, flags); return; } /* Construct fine limit command */ start_left = (((full_range - range + 1) * 2047) / full_range); start_right = 0xfff - start_left; value[2] = start_left >> 4; value[3] = start_right >> 4; value[4] = 0xff; value[5] = (start_right & 0xe) << 4 | (start_left & 0xe); value[6] = 0xff; hid_hw_request(hid, entry->report, HID_REQ_SET_REPORT); spin_unlock_irqrestore(&entry->report_lock, flags); } static const struct lg4ff_compat_mode_switch *lg4ff_get_mode_switch_command(const u16 real_product_id, const u16 target_product_id) { switch (real_product_id) { case USB_DEVICE_ID_LOGITECH_DFP_WHEEL: switch (target_product_id) { case USB_DEVICE_ID_LOGITECH_DFP_WHEEL: return &lg4ff_mode_switch_ext01_dfp; /* DFP can only be switched to its native mode */ default: return NULL; } break; case USB_DEVICE_ID_LOGITECH_G25_WHEEL: switch (target_product_id) { case USB_DEVICE_ID_LOGITECH_DFP_WHEEL: return &lg4ff_mode_switch_ext01_dfp; case USB_DEVICE_ID_LOGITECH_G25_WHEEL: return &lg4ff_mode_switch_ext16_g25; /* G25 can only be switched to DFP mode or its native mode */ default: return NULL; } break; case USB_DEVICE_ID_LOGITECH_G27_WHEEL: switch (target_product_id) { case USB_DEVICE_ID_LOGITECH_WHEEL: return &lg4ff_mode_switch_ext09_dfex; case USB_DEVICE_ID_LOGITECH_DFP_WHEEL: return &lg4ff_mode_switch_ext09_dfp; case USB_DEVICE_ID_LOGITECH_G25_WHEEL: return &lg4ff_mode_switch_ext09_g25; case USB_DEVICE_ID_LOGITECH_G27_WHEEL: return &lg4ff_mode_switch_ext09_g27; /* G27 can only be switched to DF-EX, DFP, G25 or its native mode */ default: return NULL; } break; case USB_DEVICE_ID_LOGITECH_G29_WHEEL: switch (target_product_id) { case USB_DEVICE_ID_LOGITECH_DFP_WHEEL: return &lg4ff_mode_switch_ext09_dfp; case USB_DEVICE_ID_LOGITECH_DFGT_WHEEL: return &lg4ff_mode_switch_ext09_dfgt; case USB_DEVICE_ID_LOGITECH_G25_WHEEL: return &lg4ff_mode_switch_ext09_g25; case USB_DEVICE_ID_LOGITECH_G27_WHEEL: return &lg4ff_mode_switch_ext09_g27; case USB_DEVICE_ID_LOGITECH_G29_WHEEL: return &lg4ff_mode_switch_ext09_g29; /* G29 can only be switched to DF-EX, DFP, DFGT, G25, G27 or its native mode */ default: return NULL; } break; case USB_DEVICE_ID_LOGITECH_DFGT_WHEEL: switch (target_product_id) { case USB_DEVICE_ID_LOGITECH_WHEEL: return &lg4ff_mode_switch_ext09_dfex; case USB_DEVICE_ID_LOGITECH_DFP_WHEEL: return &lg4ff_mode_switch_ext09_dfp; case USB_DEVICE_ID_LOGITECH_DFGT_WHEEL: return &lg4ff_mode_switch_ext09_dfgt; /* DFGT can only be switched to DF-EX, DFP or its native mode */ default: return NULL; } break; /* No other wheels have multiple modes */ default: return NULL; } } static int lg4ff_switch_compatibility_mode(struct hid_device *hid, const struct lg4ff_compat_mode_switch *s) { struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; unsigned long flags; s32 *value; u8 i; drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Private driver data not found!\n"); return -EINVAL; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found!\n"); return -EINVAL; } value = entry->report->field[0]->value; spin_lock_irqsave(&entry->report_lock, flags); for (i = 0; i < s->cmd_count; i++) { u8 j; for (j = 0; j < 7; j++) value[j] = s->cmd[j + (7*i)]; hid_hw_request(hid, entry->report, HID_REQ_SET_REPORT); } spin_unlock_irqrestore(&entry->report_lock, flags); hid_hw_wait(hid); return 0; } static ssize_t lg4ff_alternate_modes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hid = to_hid_device(dev); struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; ssize_t count = 0; int i; drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Private driver data not found!\n"); return 0; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found!\n"); return 0; } if (!entry->wdata.real_name) { hid_err(hid, "NULL pointer to string\n"); return 0; } for (i = 0; i < LG4FF_MODE_MAX_IDX; i++) { if (entry->wdata.alternate_modes & BIT(i)) { /* Print tag and full name */ count += sysfs_emit_at(buf, count, "%s: %s", lg4ff_alternate_modes[i].tag, !lg4ff_alternate_modes[i].product_id ? entry->wdata.real_name : lg4ff_alternate_modes[i].name); if (count >= PAGE_SIZE - 1) return count; /* Mark the currently active mode with an asterisk */ if (lg4ff_alternate_modes[i].product_id == entry->wdata.product_id || (lg4ff_alternate_modes[i].product_id == 0 && entry->wdata.product_id == entry->wdata.real_product_id)) count += sysfs_emit_at(buf, count, " *\n"); else count += sysfs_emit_at(buf, count, "\n"); if (count >= PAGE_SIZE - 1) return count; } } return count; } static ssize_t lg4ff_alternate_modes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hid = to_hid_device(dev); struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; const struct lg4ff_compat_mode_switch *s; u16 target_product_id = 0; int i, ret; char *lbuf; drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Private driver data not found!\n"); return -EINVAL; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found!\n"); return -EINVAL; } /* Allow \n at the end of the input parameter */ lbuf = kasprintf(GFP_KERNEL, "%s", buf); if (!lbuf) return -ENOMEM; i = strlen(lbuf); if (i == 0) { kfree(lbuf); return -EINVAL; } if (lbuf[i-1] == '\n') { if (i == 1) { kfree(lbuf); return -EINVAL; } lbuf[i-1] = '\0'; } for (i = 0; i < LG4FF_MODE_MAX_IDX; i++) { const u16 mode_product_id = lg4ff_alternate_modes[i].product_id; const char *tag = lg4ff_alternate_modes[i].tag; if (entry->wdata.alternate_modes & BIT(i)) { if (!strcmp(tag, lbuf)) { if (!mode_product_id) target_product_id = entry->wdata.real_product_id; else target_product_id = mode_product_id; break; } } } if (i == LG4FF_MODE_MAX_IDX) { hid_info(hid, "Requested mode \"%s\" is not supported by the device\n", lbuf); kfree(lbuf); return -EINVAL; } kfree(lbuf); /* Not needed anymore */ if (target_product_id == entry->wdata.product_id) /* Nothing to do */ return count; /* Automatic switching has to be disabled for the switch to DF-EX mode to work correctly */ if (target_product_id == USB_DEVICE_ID_LOGITECH_WHEEL && !lg4ff_no_autoswitch) { hid_info(hid, "\"%s\" cannot be switched to \"DF-EX\" mode. Load the \"hid_logitech\" module with \"lg4ff_no_autoswitch=1\" parameter set and try again\n", entry->wdata.real_name); return -EINVAL; } /* Take care of hardware limitations */ if ((entry->wdata.real_product_id == USB_DEVICE_ID_LOGITECH_DFP_WHEEL || entry->wdata.real_product_id == USB_DEVICE_ID_LOGITECH_G25_WHEEL) && entry->wdata.product_id > target_product_id) { hid_info(hid, "\"%s\" cannot be switched back into \"%s\" mode\n", entry->wdata.real_name, lg4ff_alternate_modes[i].name); return -EINVAL; } s = lg4ff_get_mode_switch_command(entry->wdata.real_product_id, target_product_id); if (!s) { hid_err(hid, "Invalid target product ID %X\n", target_product_id); return -EINVAL; } ret = lg4ff_switch_compatibility_mode(hid, s); return (ret == 0 ? count : ret); } static DEVICE_ATTR(alternate_modes, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH, lg4ff_alternate_modes_show, lg4ff_alternate_modes_store); static ssize_t lg4ff_combine_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hid = to_hid_device(dev); struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; size_t count; drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Private driver data not found!\n"); return 0; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found!\n"); return 0; } count = sysfs_emit(buf, "%u\n", entry->wdata.combine); return count; } static ssize_t lg4ff_combine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hid = to_hid_device(dev); struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; u16 combine = simple_strtoul(buf, NULL, 10); drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Private driver data not found!\n"); return -EINVAL; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found!\n"); return -EINVAL; } if (combine > 1) combine = 1; entry->wdata.combine = combine; return count; } static DEVICE_ATTR(combine_pedals, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH, lg4ff_combine_show, lg4ff_combine_store); /* Export the currently set range of the wheel */ static ssize_t lg4ff_range_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hid = to_hid_device(dev); struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; size_t count; drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Private driver data not found!\n"); return 0; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found!\n"); return 0; } count = sysfs_emit(buf, "%u\n", entry->wdata.range); return count; } /* Set range to user specified value, call appropriate function * according to the type of the wheel */ static ssize_t lg4ff_range_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hid = to_hid_device(dev); struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; u16 range = simple_strtoul(buf, NULL, 10); drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Private driver data not found!\n"); return -EINVAL; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found!\n"); return -EINVAL; } if (range == 0) range = entry->wdata.max_range; /* Check if the wheel supports range setting * and that the range is within limits for the wheel */ if (entry->wdata.set_range && range >= entry->wdata.min_range && range <= entry->wdata.max_range) { entry->wdata.set_range(hid, range); entry->wdata.range = range; } return count; } static DEVICE_ATTR(range, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH, lg4ff_range_show, lg4ff_range_store); static ssize_t lg4ff_real_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hid = to_hid_device(dev); struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; size_t count; drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Private driver data not found!\n"); return 0; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found!\n"); return 0; } if (!entry->wdata.real_tag || !entry->wdata.real_name) { hid_err(hid, "NULL pointer to string\n"); return 0; } count = sysfs_emit(buf, "%s: %s\n", entry->wdata.real_tag, entry->wdata.real_name); return count; } static ssize_t lg4ff_real_id_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { /* Real ID is a read-only value */ return -EPERM; } static DEVICE_ATTR(real_id, S_IRUGO, lg4ff_real_id_show, lg4ff_real_id_store); #ifdef CONFIG_LEDS_CLASS static void lg4ff_set_leds(struct hid_device *hid, u8 leds) { struct lg_drv_data *drv_data; struct lg4ff_device_entry *entry; unsigned long flags; s32 *value; drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Private driver data not found!\n"); return; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found!\n"); return; } value = entry->report->field[0]->value; spin_lock_irqsave(&entry->report_lock, flags); value[0] = 0xf8; value[1] = 0x12; value[2] = leds; value[3] = 0x00; value[4] = 0x00; value[5] = 0x00; value[6] = 0x00; hid_hw_request(hid, entry->report, HID_REQ_SET_REPORT); spin_unlock_irqrestore(&entry->report_lock, flags); } static void lg4ff_led_set_brightness(struct led_classdev *led_cdev, enum led_brightness value) { struct device *dev = led_cdev->dev->parent; struct hid_device *hid = to_hid_device(dev); struct lg_drv_data *drv_data = hid_get_drvdata(hid); struct lg4ff_device_entry *entry; int i, state = 0; if (!drv_data) { hid_err(hid, "Device data not found."); return; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found."); return; } for (i = 0; i < 5; i++) { if (led_cdev != entry->wdata.led[i]) continue; state = (entry->wdata.led_state >> i) & 1; if (value == LED_OFF && state) { entry->wdata.led_state &= ~(1 << i); lg4ff_set_leds(hid, entry->wdata.led_state); } else if (value != LED_OFF && !state) { entry->wdata.led_state |= 1 << i; lg4ff_set_leds(hid, entry->wdata.led_state); } break; } } static enum led_brightness lg4ff_led_get_brightness(struct led_classdev *led_cdev) { struct device *dev = led_cdev->dev->parent; struct hid_device *hid = to_hid_device(dev); struct lg_drv_data *drv_data = hid_get_drvdata(hid); struct lg4ff_device_entry *entry; int i, value = 0; if (!drv_data) { hid_err(hid, "Device data not found."); return LED_OFF; } entry = drv_data->device_props; if (!entry) { hid_err(hid, "Device properties not found."); return LED_OFF; } for (i = 0; i < 5; i++) if (led_cdev == entry->wdata.led[i]) { value = (entry->wdata.led_state >> i) & 1; break; } return value ? LED_FULL : LED_OFF; } #endif static u16 lg4ff_identify_multimode_wheel(struct hid_device *hid, const u16 reported_product_id, const u16 bcdDevice) { u32 current_mode; int i; /* identify current mode from USB PID */ for (i = 1; i < ARRAY_SIZE(lg4ff_alternate_modes); i++) { dbg_hid("Testing whether PID is %X\n", lg4ff_alternate_modes[i].product_id); if (reported_product_id == lg4ff_alternate_modes[i].product_id) break; } if (i == ARRAY_SIZE(lg4ff_alternate_modes)) return 0; current_mode = BIT(i); for (i = 0; i < ARRAY_SIZE(lg4ff_main_checklist); i++) { const u16 mask = lg4ff_main_checklist[i]->mask; const u16 result = lg4ff_main_checklist[i]->result; const u16 real_product_id = lg4ff_main_checklist[i]->real_product_id; if ((current_mode & lg4ff_main_checklist[i]->modes) && \ (bcdDevice & mask) == result) { dbg_hid("Found wheel with real PID %X whose reported PID is %X\n", real_product_id, reported_product_id); return real_product_id; } } /* No match found. This is either Driving Force or an unknown * wheel model, do not touch it */ dbg_hid("Wheel with bcdDevice %X was not recognized as multimode wheel, leaving in its current mode\n", bcdDevice); return 0; } static int lg4ff_handle_multimode_wheel(struct hid_device *hid, u16 *real_product_id, const u16 bcdDevice) { const u16 reported_product_id = hid->product; int ret; *real_product_id = lg4ff_identify_multimode_wheel(hid, reported_product_id, bcdDevice); /* Probed wheel is not a multimode wheel */ if (!*real_product_id) { *real_product_id = reported_product_id; dbg_hid("Wheel is not a multimode wheel\n"); return LG4FF_MMODE_NOT_MULTIMODE; } /* Switch from "Driving Force" mode to native mode automatically. * Otherwise keep the wheel in its current mode */ if (reported_product_id == USB_DEVICE_ID_LOGITECH_WHEEL && reported_product_id != *real_product_id && !lg4ff_no_autoswitch) { const struct lg4ff_compat_mode_switch *s = lg4ff_get_mode_switch_command(*real_product_id, *real_product_id); if (!s) { hid_err(hid, "Invalid product id %X\n", *real_product_id); return LG4FF_MMODE_NOT_MULTIMODE; } ret = lg4ff_switch_compatibility_mode(hid, s); if (ret) { /* Wheel could not have been switched to native mode, * leave it in "Driving Force" mode and continue */ hid_err(hid, "Unable to switch wheel mode, errno %d\n", ret); return LG4FF_MMODE_IS_MULTIMODE; } return LG4FF_MMODE_SWITCHED; } return LG4FF_MMODE_IS_MULTIMODE; } int lg4ff_init(struct hid_device *hid) { struct hid_input *hidinput; struct input_dev *dev; struct list_head *report_list = &hid->report_enum[HID_OUTPUT_REPORT].report_list; struct hid_report *report = list_entry(report_list->next, struct hid_report, list); const struct usb_device_descriptor *udesc = &(hid_to_usb_dev(hid)->descriptor); const u16 bcdDevice = le16_to_cpu(udesc->bcdDevice); const struct lg4ff_multimode_wheel *mmode_wheel = NULL; struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; int error, i, j; int mmode_ret, mmode_idx = -1; u16 real_product_id; if (list_empty(&hid->inputs)) { hid_err(hid, "no inputs found\n"); return -ENODEV; } hidinput = list_entry(hid->inputs.next, struct hid_input, list); dev = hidinput->input; /* Check that the report looks ok */ if (!hid_validate_values(hid, HID_OUTPUT_REPORT, 0, 0, 7)) return -1; drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Cannot add device, private driver data not allocated\n"); return -1; } entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; spin_lock_init(&entry->report_lock); entry->report = report; drv_data->device_props = entry; /* Check if a multimode wheel has been connected and * handle it appropriately */ mmode_ret = lg4ff_handle_multimode_wheel(hid, &real_product_id, bcdDevice); /* Wheel has been told to switch to native mode. There is no point in going on * with the initialization as the wheel will do a USB reset when it switches mode */ if (mmode_ret == LG4FF_MMODE_SWITCHED) return 0; else if (mmode_ret < 0) { hid_err(hid, "Unable to switch device mode during initialization, errno %d\n", mmode_ret); error = mmode_ret; goto err_init; } /* Check what wheel has been connected */ for (i = 0; i < ARRAY_SIZE(lg4ff_devices); i++) { if (hid->product == lg4ff_devices[i].product_id) { dbg_hid("Found compatible device, product ID %04X\n", lg4ff_devices[i].product_id); break; } } if (i == ARRAY_SIZE(lg4ff_devices)) { hid_err(hid, "This device is flagged to be handled by the lg4ff module but this module does not know how to handle it. " "Please report this as a bug to LKML, Simon Wood <simon@mungewell.org> or " "Michal Maly <madcatxster@devoid-pointer.net>\n"); error = -1; goto err_init; } if (mmode_ret == LG4FF_MMODE_IS_MULTIMODE) { for (mmode_idx = 0; mmode_idx < ARRAY_SIZE(lg4ff_multimode_wheels); mmode_idx++) { if (real_product_id == lg4ff_multimode_wheels[mmode_idx].product_id) break; } if (mmode_idx == ARRAY_SIZE(lg4ff_multimode_wheels)) { hid_err(hid, "Device product ID %X is not listed as a multimode wheel", real_product_id); error = -1; goto err_init; } } /* Set supported force feedback capabilities */ for (j = 0; lg4ff_devices[i].ff_effects[j] >= 0; j++) set_bit(lg4ff_devices[i].ff_effects[j], dev->ffbit); error = input_ff_create_memless(dev, NULL, lg4ff_play); if (error) goto err_init; /* Initialize device properties */ if (mmode_ret == LG4FF_MMODE_IS_MULTIMODE) { if (WARN_ON(mmode_idx == -1)) return -EINVAL; mmode_wheel = &lg4ff_multimode_wheels[mmode_idx]; } lg4ff_init_wheel_data(&entry->wdata, &lg4ff_devices[i], mmode_wheel, real_product_id); /* Check if autocentering is available and * set the centering force to zero by default */ if (test_bit(FF_AUTOCENTER, dev->ffbit)) { /* Formula Force EX expects different autocentering command */ if ((bcdDevice >> 8) == LG4FF_FFEX_REV_MAJ && (bcdDevice & 0xff) == LG4FF_FFEX_REV_MIN) dev->ff->set_autocenter = lg4ff_set_autocenter_ffex; else dev->ff->set_autocenter = lg4ff_set_autocenter_default; dev->ff->set_autocenter(dev, 0); } /* Create sysfs interface */ error = device_create_file(&hid->dev, &dev_attr_combine_pedals); if (error) hid_warn(hid, "Unable to create sysfs interface for \"combine\", errno %d\n", error); error = device_create_file(&hid->dev, &dev_attr_range); if (error) hid_warn(hid, "Unable to create sysfs interface for \"range\", errno %d\n", error); if (mmode_ret == LG4FF_MMODE_IS_MULTIMODE) { error = device_create_file(&hid->dev, &dev_attr_real_id); if (error) hid_warn(hid, "Unable to create sysfs interface for \"real_id\", errno %d\n", error); error = device_create_file(&hid->dev, &dev_attr_alternate_modes); if (error) hid_warn(hid, "Unable to create sysfs interface for \"alternate_modes\", errno %d\n", error); } dbg_hid("sysfs interface created\n"); /* Set the maximum range to start with */ entry->wdata.range = entry->wdata.max_range; if (entry->wdata.set_range) entry->wdata.set_range(hid, entry->wdata.range); #ifdef CONFIG_LEDS_CLASS /* register led subsystem - G27/G29 only */ entry->wdata.led_state = 0; for (j = 0; j < 5; j++) entry->wdata.led[j] = NULL; if (lg4ff_devices[i].product_id == USB_DEVICE_ID_LOGITECH_G27_WHEEL || lg4ff_devices[i].product_id == USB_DEVICE_ID_LOGITECH_G29_WHEEL) { struct led_classdev *led; size_t name_sz; char *name; lg4ff_set_leds(hid, 0); name_sz = strlen(dev_name(&hid->dev)) + 8; for (j = 0; j < 5; j++) { led = kzalloc(sizeof(struct led_classdev)+name_sz, GFP_KERNEL); if (!led) { hid_err(hid, "can't allocate memory for LED %d\n", j); goto err_leds; } name = (void *)(&led[1]); snprintf(name, name_sz, "%s::RPM%d", dev_name(&hid->dev), j+1); led->name = name; led->brightness = 0; led->max_brightness = 1; led->brightness_get = lg4ff_led_get_brightness; led->brightness_set = lg4ff_led_set_brightness; entry->wdata.led[j] = led; error = led_classdev_register(&hid->dev, led); if (error) { hid_err(hid, "failed to register LED %d. Aborting.\n", j); err_leds: /* Deregister LEDs (if any) */ for (j = 0; j < 5; j++) { led = entry->wdata.led[j]; entry->wdata.led[j] = NULL; if (!led) continue; led_classdev_unregister(led); kfree(led); } goto out; /* Let the driver continue without LEDs */ } } } out: #endif hid_info(hid, "Force feedback support for Logitech Gaming Wheels\n"); return 0; err_init: drv_data->device_props = NULL; kfree(entry); return error; } int lg4ff_deinit(struct hid_device *hid) { struct lg4ff_device_entry *entry; struct lg_drv_data *drv_data; drv_data = hid_get_drvdata(hid); if (!drv_data) { hid_err(hid, "Error while deinitializing device, no private driver data.\n"); return -1; } entry = drv_data->device_props; if (!entry) goto out; /* Nothing more to do */ /* Multimode devices will have at least the "MODE_NATIVE" bit set */ if (entry->wdata.alternate_modes) { device_remove_file(&hid->dev, &dev_attr_real_id); device_remove_file(&hid->dev, &dev_attr_alternate_modes); } device_remove_file(&hid->dev, &dev_attr_combine_pedals); device_remove_file(&hid->dev, &dev_attr_range); #ifdef CONFIG_LEDS_CLASS { int j; struct led_classdev *led; /* Deregister LEDs (if any) */ for (j = 0; j < 5; j++) { led = entry->wdata.led[j]; entry->wdata.led[j] = NULL; if (!led) continue; led_classdev_unregister(led); kfree(led); } } #endif drv_data->device_props = NULL; kfree(entry); out: dbg_hid("Device successfully unregistered\n"); return 0; }
2 16 98 1 4244 102 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 2 2 505 503 2 2 501 481 54 195 183 16 16 184 99 98 22 421 29 402 3 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 /* * kernel/cpuset.c * * Processor and Memory placement constraints for sets of tasks. * * Copyright (C) 2003 BULL SA. * Copyright (C) 2004-2007 Silicon Graphics, Inc. * Copyright (C) 2006 Google, Inc * * Portions derived from Patrick Mochel's sysfs code. * sysfs is Copyright (c) 2001-3 Patrick Mochel * * 2003-10-10 Written by Simon Derr. * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * 2006 Rework by Paul Menage to use generic cgroups * 2008 Rework of the scheduler domains and CPU hotplug handling * by Max Krasnyansky * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ #include "cpuset-internal.h" #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/memory.h> #include <linux/export.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/deadline.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/security.h> #include <linux/oom.h> #include <linux/sched/isolation.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <linux/task_work.h> DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* * There could be abnormal cpuset configurations for cpu or memory * node binding, add this key to provide a quick low-cost judgment * of the situation. */ DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); static const char * const perr_strings[] = { [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", [PERR_INVPARENT] = "Parent is an invalid partition root", [PERR_NOTPART] = "Parent is not a partition root", [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", [PERR_HOTPLUG] = "No cpu available due to hotplug", [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", [PERR_ACCESS] = "Enable partition not permitted", [PERR_REMOTE] = "Have remote partition underneath", }; /* * For local partitions, update to subpartitions_cpus & isolated_cpus is done * in update_parent_effective_cpumask(). For remote partitions, it is done in * the remote_partition_*() and remote_cpus_update() helpers. */ /* * Exclusive CPUs distributed out to local or remote sub-partitions of * top_cpuset */ static cpumask_var_t subpartitions_cpus; /* * Exclusive CPUs in isolated partitions */ static cpumask_var_t isolated_cpus; /* * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot */ static cpumask_var_t boot_hk_cpus; static bool have_boot_isolcpus; /* List of remote partition root children */ static struct list_head remote_children; /* * A flag to force sched domain rebuild at the end of an operation. * It can be set in * - update_partition_sd_lb() * - update_cpumasks_hier() * - cpuset_update_flag() * - cpuset_hotplug_update_tasks() * - cpuset_handle_hotplug() * * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. * * Note that update_relax_domain_level() in cpuset-v1.c can still call * rebuild_sched_domains_locked() directly without using this flag. */ static bool force_sd_rebuild; /* * Partition root states: * * 0 - member (not a partition root) * 1 - partition root * 2 - partition root without load balancing (isolated) * -1 - invalid partition root * -2 - invalid isolated partition root * * There are 2 types of partitions - local or remote. Local partitions are * those whose parents are partition root themselves. Setting of * cpuset.cpus.exclusive are optional in setting up local partitions. * Remote partitions are those whose parents are not partition roots. Passing * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor * nodes are mandatory in creating a remote partition. * * For simplicity, a local partition can be created under a local or remote * partition but a remote partition cannot have any partition root in its * ancestor chain except the cgroup root. */ #define PRS_MEMBER 0 #define PRS_ROOT 1 #define PRS_ISOLATED 2 #define PRS_INVALID_ROOT -1 #define PRS_INVALID_ISOLATED -2 /* * Temporary cpumasks for working with partitions that are passed among * functions to avoid memory allocation in inner functions. */ struct tmpmasks { cpumask_var_t addmask, delmask; /* For partition root */ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ }; void inc_dl_tasks_cs(struct task_struct *p) { struct cpuset *cs = task_cs(p); cs->nr_deadline_tasks++; } void dec_dl_tasks_cs(struct task_struct *p) { struct cpuset *cs = task_cs(p); cs->nr_deadline_tasks--; } static inline bool is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; } static inline bool is_partition_invalid(const struct cpuset *cs) { return cs->partition_root_state < 0; } static inline bool cs_is_member(const struct cpuset *cs) { return cs->partition_root_state == PRS_MEMBER; } /* * Callers should hold callback_lock to modify partition_root_state. */ static inline void make_partition_invalid(struct cpuset *cs) { if (cs->partition_root_state > 0) cs->partition_root_state = -cs->partition_root_state; } /* * Send notification event of whenever partition_root_state changes. */ static inline void notify_partition_change(struct cpuset *cs, int old_prs) { if (old_prs == cs->partition_root_state) return; cgroup_file_notify(&cs->partition_file); /* Reset prs_err if not invalid */ if (is_partition_valid(cs)) WRITE_ONCE(cs->prs_err, PERR_NONE); } /* * The top_cpuset is always synchronized to cpu_active_mask and we should avoid * using cpu_online_mask as much as possible. An active CPU is always an online * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ * during hotplug operations. A CPU is marked active at the last stage of CPU * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code * will be called to update the sched domains so that the scheduler can move * a normal task to a newly active CPU or remove tasks away from a newly * inactivated CPU. The online bit is set much earlier in the CPU bringup * process and cleared much later in CPU teardown. * * If cpu_online_mask is used while a hotunplug operation is happening in * parallel, we may leave an offline CPU in cpu_allowed or some other masks. */ static struct cpuset top_cpuset = { .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, .relax_domain_level = -1, .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; /* * There are two global locks guarding cpuset structures - cpuset_mutex and * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset * structures. Note that cpuset_mutex needs to be a mutex as it is used in * paths that rely on priority inheritance (e.g. scheduler - on RT) for * correctness. * * A task must hold both locks to modify cpusets. If a task holds * cpuset_mutex, it blocks others, ensuring that it is the only task able to * also acquire callback_lock and be able to modify cpusets. It can perform * various checks on the cpuset structure first, knowing nothing will change. * It can also allocate memory while just holding cpuset_mutex. While it is * performing these checks, various callback routines can briefly acquire * callback_lock to query cpusets. Once it is ready to make the changes, it * takes callback_lock, blocking everyone else. * * Calls to the kernel memory allocator can not be made while holding * callback_lock, as that would risk double tripping on callback_lock * from one of the callbacks into the cpuset code from within * __alloc_pages(). * * If a task is only holding callback_lock, then it has read-only * access to cpusets. * * Now, the task_struct fields mems_allowed and mempolicy may be changed * by other task, we use alloc_lock in the task_struct fields to protect * them. * * The cpuset_common_seq_show() handlers only hold callback_lock across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. */ static DEFINE_MUTEX(cpuset_mutex); /** * cpuset_lock - Acquire the global cpuset mutex * * This locks the global cpuset mutex to prevent modifications to cpuset * hierarchy and configurations. This helper is not enough to make modification. */ void cpuset_lock(void) { mutex_lock(&cpuset_mutex); } void cpuset_unlock(void) { mutex_unlock(&cpuset_mutex); } /** * cpuset_full_lock - Acquire full protection for cpuset modification * * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex * to safely modify cpuset data. */ void cpuset_full_lock(void) { cpus_read_lock(); mutex_lock(&cpuset_mutex); } void cpuset_full_unlock(void) { mutex_unlock(&cpuset_mutex); cpus_read_unlock(); } static DEFINE_SPINLOCK(callback_lock); void cpuset_callback_lock_irq(void) { spin_lock_irq(&callback_lock); } void cpuset_callback_unlock_irq(void) { spin_unlock_irq(&callback_lock); } static struct workqueue_struct *cpuset_migrate_mm_wq; static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); static inline void check_insane_mems_config(nodemask_t *nodes) { if (!cpusets_insane_config() && movable_only_nodes(nodes)) { static_branch_enable_cpuslocked(&cpusets_insane_config_key); pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" "Cpuset allocations might fail even with a lot of memory available.\n", nodemask_pr_args(nodes)); } } /* * decrease cs->attach_in_progress. * wake_up cpuset_attach_wq if cs->attach_in_progress==0. */ static inline void dec_attach_in_progress_locked(struct cpuset *cs) { lockdep_assert_held(&cpuset_mutex); cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); } static inline void dec_attach_in_progress(struct cpuset *cs) { mutex_lock(&cpuset_mutex); dec_attach_in_progress_locked(cs); mutex_unlock(&cpuset_mutex); } static inline bool cpuset_v2(void) { return !IS_ENABLED(CONFIG_CPUSETS_V1) || cgroup_subsys_on_dfl(cpuset_cgrp_subsys); } /* * Cgroup v2 behavior is used on the "cpus" and "mems" control files when * on default hierarchy or when the cpuset_v2_mode flag is set by mounting * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. * With v2 behavior, "cpus" and "mems" are always what the users have * requested and won't be changed by hotplug events. Only the effective * cpus or mems will be affected. */ static inline bool is_in_v2_mode(void) { return cpuset_v2() || (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } /** * partition_is_populated - check if partition has tasks * @cs: partition root to be checked * @excluded_child: a child cpuset to be excluded in task checking * Return: true if there are tasks, false otherwise * * It is assumed that @cs is a valid partition root. @excluded_child should * be non-NULL when this cpuset is going to become a partition itself. */ static inline bool partition_is_populated(struct cpuset *cs, struct cpuset *excluded_child) { struct cgroup_subsys_state *css; struct cpuset *child; if (cs->css.cgroup->nr_populated_csets) return true; if (!excluded_child && !cs->nr_subparts) return cgroup_is_populated(cs->css.cgroup); rcu_read_lock(); cpuset_for_each_child(child, css, cs) { if (child == excluded_child) continue; if (is_partition_valid(child)) continue; if (cgroup_is_populated(child->css.cgroup)) { rcu_read_unlock(); return true; } } rcu_read_unlock(); return false; } /* * Return in pmask the portion of a task's cpusets's cpus_allowed that * are online and are capable of running the task. If none are found, * walk up the cpuset hierarchy until we find one that does have some * appropriate cpus. * * One way or another, we guarantee to return some non-empty subset * of cpu_active_mask. * * Call with callback_lock or cpuset_mutex held. */ static void guarantee_active_cpus(struct task_struct *tsk, struct cpumask *pmask) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); struct cpuset *cs; if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask))) cpumask_copy(pmask, cpu_active_mask); rcu_read_lock(); cs = task_cs(tsk); while (!cpumask_intersects(cs->effective_cpus, pmask)) cs = parent_cs(cs); cpumask_and(pmask, pmask, cs->effective_cpus); rcu_read_unlock(); } /* * Return in *pmask the portion of a cpusets's mems_allowed that * are online, with memory. If none are online with memory, walk * up the cpuset hierarchy until we find one that does have some * online mems. The top cpuset always has some mems online. * * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) cs = parent_cs(cs); nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } /** * alloc_cpumasks - Allocate an array of cpumask variables * @pmasks: Pointer to array of cpumask_var_t pointers * @size: Number of cpumasks to allocate * Return: 0 if successful, -ENOMEM otherwise. * * Allocates @size cpumasks and initializes them to empty. Returns 0 on * success, -ENOMEM on allocation failure. On failure, any previously * allocated cpumasks are freed. */ static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size) { int i; for (i = 0; i < size; i++) { if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) { while (--i >= 0) free_cpumask_var(*pmasks[i]); return -ENOMEM; } } return 0; } /** * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations. * @tmp: Pointer to tmpmasks structure to populate * Return: 0 on success, -ENOMEM on allocation failure */ static inline int alloc_tmpmasks(struct tmpmasks *tmp) { /* * Array of pointers to the three cpumask_var_t fields in tmpmasks. * Note: Array size must match actual number of masks (3) */ cpumask_var_t *pmask[3] = { &tmp->new_cpus, &tmp->addmask, &tmp->delmask }; return alloc_cpumasks(pmask, ARRAY_SIZE(pmask)); } /** * free_tmpmasks - free cpumasks in a tmpmasks structure * @tmp: the tmpmasks structure pointer */ static inline void free_tmpmasks(struct tmpmasks *tmp) { if (!tmp) return; free_cpumask_var(tmp->new_cpus); free_cpumask_var(tmp->addmask); free_cpumask_var(tmp->delmask); } /** * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset * @cs: Source cpuset to duplicate (NULL for a fresh allocation) * * Creates a new cpuset by either: * 1. Duplicating an existing cpuset (if @cs is non-NULL), or * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL) * * Return: Pointer to newly allocated cpuset on success, NULL on failure */ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) { struct cpuset *trial; /* Allocate base structure */ trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) : kzalloc(sizeof(*cs), GFP_KERNEL); if (!trial) return NULL; /* Setup cpumask pointer array */ cpumask_var_t *pmask[4] = { &trial->cpus_allowed, &trial->effective_cpus, &trial->effective_xcpus, &trial->exclusive_cpus }; if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) { kfree(trial); return NULL; } /* Copy masks if duplicating */ if (cs) { cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->effective_cpus, cs->effective_cpus); cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); } return trial; } /** * free_cpuset - free the cpuset * @cs: the cpuset to be freed */ static inline void free_cpuset(struct cpuset *cs) { free_cpumask_var(cs->cpus_allowed); free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->effective_xcpus); free_cpumask_var(cs->exclusive_cpus); kfree(cs); } /* Return user specified exclusive CPUs */ static inline struct cpumask *user_xcpus(struct cpuset *cs) { return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed : cs->exclusive_cpus; } static inline bool xcpus_empty(struct cpuset *cs) { return cpumask_empty(cs->cpus_allowed) && cpumask_empty(cs->exclusive_cpus); } /* * cpusets_are_exclusive() - check if two cpusets are exclusive * * Return true if exclusive, false if not */ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) { struct cpumask *xcpus1 = user_xcpus(cs1); struct cpumask *xcpus2 = user_xcpus(cs2); if (cpumask_intersects(xcpus1, xcpus2)) return false; return true; } /** * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts * @cs1: first cpuset to check * @cs2: second cpuset to check * * Returns: true if CPU exclusivity conflict exists, false otherwise * * Conflict detection rules: * 1. If either cpuset is CPU exclusive, they must be mutually exclusive * 2. exclusive_cpus masks cannot intersect between cpusets * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs */ static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) { /* If either cpuset is exclusive, check if they are mutually exclusive */ if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) return !cpusets_are_exclusive(cs1, cs2); /* Exclusive_cpus cannot intersect */ if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) return true; /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ if (!cpumask_empty(cs1->cpus_allowed) && cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) return true; if (!cpumask_empty(cs2->cpus_allowed) && cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) return true; return false; } static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) { if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2))) return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); return false; } /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. * * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes * cpuset_mutex held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the * cpuset in the list must use cur below, not trial. * * 'trial' is the address of bulk structure copy of cur, with * perhaps one or more of the fields cpus_allowed, mems_allowed, * or flags changed to new, trial values. * * Return 0 if valid, -errno if not. */ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; int ret = 0; rcu_read_lock(); if (!is_in_v2_mode()) ret = cpuset1_validate_change(cur, trial); if (ret) goto out; /* Remaining checks don't apply to root cpuset */ if (cur == &top_cpuset) goto out; par = parent_cs(cur); /* * Cpusets with tasks - existing or newly being attached - can't * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; if (!nodes_empty(cur->mems_allowed) && nodes_empty(trial->mems_allowed)) goto out; } /* * We can't shrink if we won't have enough room for SCHED_DEADLINE * tasks. This check is not done when scheduling is disabled as the * users should know what they are doing. * * For v1, effective_cpus == cpus_allowed & user_xcpus() returns * cpus_allowed. * * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only * for non-isolated partition root. At this point, the target * effective_cpus isn't computed yet. user_xcpus() is the best * approximation. * * TBD: May need to precompute the real effective_cpus here in case * incorrect scheduling of SCHED_DEADLINE tasks in a partition * becomes an issue. */ ret = -EBUSY; if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) && !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial))) goto out; /* * If either I or some sibling (!= me) is exclusive, we can't * overlap. exclusive_cpus cannot overlap with each other if set. */ ret = -EINVAL; cpuset_for_each_child(c, css, par) { if (c == cur) continue; if (cpus_excl_conflict(trial, c)) goto out; if (mems_excl_conflict(trial, c)) goto out; } ret = 0; out: rcu_read_unlock(); return ret; } #ifdef CONFIG_SMP /* * Helper routine for generate_sched_domains(). * Do cpusets a, b have overlapping effective cpus_allowed masks? */ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { return cpumask_intersects(a->effective_cpus, b->effective_cpus); } static void update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) { if (dattr->relax_domain_level < c->relax_domain_level) dattr->relax_domain_level = c->relax_domain_level; return; } static void update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *root_cs) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { pos_css = css_rightmost_descendant(pos_css); continue; } if (is_sched_load_balance(cp)) update_domain_attr(dattr, cp); } rcu_read_unlock(); } /* Must be called with cpuset_mutex held. */ static inline int nr_cpusets(void) { /* jump label reference count + the top-level cpuset */ return static_key_count(&cpusets_enabled_key.key) + 1; } /* * generate_sched_domains() * * This function builds a partial partition of the systems CPUs * A 'partial partition' is a set of non-overlapping subsets whose * union is a subset of that set. * The output of this function needs to be passed to kernel/sched/core.c * partition_sched_domains() routine, which will rebuild the scheduler's * load balancing domains (sched domains) as specified by that partial * partition. * * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst * for a background explanation of this. * * Does not return errors, on the theory that the callers of this * routine would rather not worry about failures to rebuild sched * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * * Must be called with cpuset_mutex held. * * The three key local variables below are: * cp - cpuset pointer, used (together with pos_css) to perform a * top-down scan of all cpusets. For our purposes, rebuilding * the schedulers sched domains, we can ignore !is_sched_load_ * balance cpusets. * csa - (for CpuSet Array) Array of pointers to all the cpusets * that need to be load balanced, for convenient iterative * access by the subsequent code that finds the best partition, * i.e the set of domains (subsets) of CPUs such that the * cpus_allowed of every cpuset marked is_sched_load_balance * is a subset of one of these domains, while there are as * many such domains as possible, each as small as possible. * doms - Conversion of 'csa' to an array of cpumasks, for passing to * the kernel/sched/core.c routine partition_sched_domains() in a * convenient format, that can be easily compared to the prior * value to determine what partition elements (sched domains) * were changed (added or removed.) * * Finding the best partition (set of domains): * The double nested loops below over i, j scan over the load * balanced cpusets (using the array of cpuset pointers in csa[]) * looking for pairs of cpusets that have overlapping cpus_allowed * and merging them using a union-find algorithm. * * The union of the cpus_allowed masks from the set of all cpusets * having the same root then form the one element of the partition * (one sched domain) to be passed to partition_sched_domains(). * */ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; bool root_load_balance = is_sched_load_balance(&top_cpuset); bool cgrpv2 = cpuset_v2(); int nslot_update; doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (root_load_balance && cpumask_empty(subpartitions_cpus)) { single_root_domain: ndoms = 1; doms = alloc_sched_domains(ndoms); if (!doms) goto done; dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); if (dattr) { *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } cpumask_and(doms[0], top_cpuset.effective_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); goto done; } csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; rcu_read_lock(); if (root_load_balance) csa[csn++] = &top_cpuset; cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { if (cp == &top_cpuset) continue; if (cgrpv2) goto v2; /* * v1: * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. The * latter: All child cpusets contain a subset of the * parent's cpus, so just skip them, and then we call * update_domain_attr_tree() to calc relax_domain_level of * the corresponding sched domain. */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && cpumask_intersects(cp->cpus_allowed, housekeeping_cpumask(HK_TYPE_DOMAIN)))) continue; if (is_sched_load_balance(cp) && !cpumask_empty(cp->effective_cpus)) csa[csn++] = cp; /* skip @cp's subtree */ pos_css = css_rightmost_descendant(pos_css); continue; v2: /* * Only valid partition roots that are not isolated and with * non-empty effective_cpus will be saved into csn[]. */ if ((cp->partition_root_state == PRS_ROOT) && !cpumask_empty(cp->effective_cpus)) csa[csn++] = cp; /* * Skip @cp's subtree if not a partition root and has no * exclusive CPUs to be granted to child cpusets. */ if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus)) pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); /* * If there are only isolated partitions underneath the cgroup root, * we can optimize out unneeded sched domains scanning. */ if (root_load_balance && (csn == 1)) goto single_root_domain; for (i = 0; i < csn; i++) uf_node_init(&csa[i]->node); /* Merge overlapping cpusets */ for (i = 0; i < csn; i++) { for (j = i + 1; j < csn; j++) { if (cpusets_overlap(csa[i], csa[j])) { /* * Cgroup v2 shouldn't pass down overlapping * partition root cpusets. */ WARN_ON_ONCE(cgrpv2); uf_union(&csa[i]->node, &csa[j]->node); } } } /* Count the total number of domains */ for (i = 0; i < csn; i++) { if (uf_find(&csa[i]->node) == &csa[i]->node) ndoms++; } /* * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. */ doms = alloc_sched_domains(ndoms); if (!doms) goto done; /* * The rest of the code, including the scheduler, can deal with * dattr==NULL case. No need to abort if alloc fails. */ dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), GFP_KERNEL); /* * Cgroup v2 doesn't support domain attributes, just set all of them * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a * subset of HK_TYPE_DOMAIN housekeeping CPUs. */ if (cgrpv2) { for (i = 0; i < ndoms; i++) { /* * The top cpuset may contain some boot time isolated * CPUs that need to be excluded from the sched domain. */ if (csa[i] == &top_cpuset) cpumask_and(doms[i], csa[i]->effective_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); else cpumask_copy(doms[i], csa[i]->effective_cpus); if (dattr) dattr[i] = SD_ATTR_INIT; } goto done; } for (nslot = 0, i = 0; i < csn; i++) { nslot_update = 0; for (j = i; j < csn; j++) { if (uf_find(&csa[j]->node) == &csa[i]->node) { struct cpumask *dp = doms[nslot]; if (i == j) { nslot_update = 1; cpumask_clear(dp); if (dattr) *(dattr + nslot) = SD_ATTR_INIT; } cpumask_or(dp, dp, csa[j]->effective_cpus); cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (dattr) update_domain_attr_tree(dattr + nslot, csa[j]); } } if (nslot_update) nslot++; } BUG_ON(nslot != ndoms); done: kfree(csa); /* * Fallback to the default domain if kmalloc() failed. * See comments in partition_sched_domains(). */ if (doms == NULL) ndoms = 1; *domains = doms; *attributes = dattr; return ndoms; } static void dl_update_tasks_root_domain(struct cpuset *cs) { struct css_task_iter it; struct task_struct *task; if (cs->nr_deadline_tasks == 0) return; css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) dl_add_task_root_domain(task); css_task_iter_end(&it); } void dl_rebuild_rd_accounting(void) { struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; int cpu; u64 cookie = ++dl_cookie; lockdep_assert_held(&cpuset_mutex); lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); rcu_read_lock(); for_each_possible_cpu(cpu) { if (dl_bw_visited(cpu, cookie)) continue; dl_clear_root_domain_cpu(cpu); } cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (cpumask_empty(cs->effective_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; } css_get(&cs->css); rcu_read_unlock(); dl_update_tasks_root_domain(cs); rcu_read_lock(); css_put(&cs->css); } rcu_read_unlock(); } /* * Rebuild scheduler domains. * * If the flag 'sched_load_balance' of any cpuset with non-empty * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset * which has that flag enabled, or if any cpuset with a non-empty * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * * Call with cpuset_mutex held. Takes cpus_read_lock(). */ void rebuild_sched_domains_locked(void) { struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; cpumask_var_t *doms; struct cpuset *cs; int ndoms; lockdep_assert_cpus_held(); lockdep_assert_held(&cpuset_mutex); force_sd_rebuild = false; /* * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). * Anyways, cpuset_handle_hotplug() will rebuild sched domains. * * With no CPUs in any subpartitions, top_cpuset's effective CPUs * should be the same as the active CPUs, so checking only top_cpuset * is enough to detect racing CPU offlines. */ if (cpumask_empty(subpartitions_cpus) && !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) return; /* * With subpartition CPUs, however, the effective CPUs of a partition * root should be only a subset of the active CPUs. Since a CPU in any * partition root could be offlined, all must be checked. */ if (!cpumask_empty(subpartitions_cpus)) { rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (!is_partition_valid(cs)) { pos_css = css_rightmost_descendant(pos_css); continue; } if (!cpumask_subset(cs->effective_cpus, cpu_active_mask)) { rcu_read_unlock(); return; } } rcu_read_unlock(); } /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ void rebuild_sched_domains_locked(void) { } #endif /* CONFIG_SMP */ static void rebuild_sched_domains_cpuslocked(void) { mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); mutex_unlock(&cpuset_mutex); } void rebuild_sched_domains(void) { cpus_read_lock(); rebuild_sched_domains_cpuslocked(); cpus_read_unlock(); } void cpuset_reset_sched_domains(void) { mutex_lock(&cpuset_mutex); partition_sched_domains(1, NULL, NULL); mutex_unlock(&cpuset_mutex); } /** * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * @new_cpus: the temp variable for the new effective_cpus mask * * Iterate through each task of @cs updating its cpus_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. * * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus * to make sure all offline CPUs are also included as hotplug code won't * update cpumasks for tasks in top_cpuset. * * As task_cpu_possible_mask() can be task dependent in arm64, we have to * do cpu masking per task instead of doing it once for all. */ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { struct css_task_iter it; struct task_struct *task; bool top_cs = cs == &top_cpuset; css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) { const struct cpumask *possible_mask = task_cpu_possible_mask(task); if (top_cs) { /* * PF_NO_SETAFFINITY tasks are ignored. * All per cpu kthreads should have PF_NO_SETAFFINITY * flag set, see kthread_set_per_cpu(). */ if (task->flags & PF_NO_SETAFFINITY) continue; cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { cpumask_and(new_cpus, possible_mask, cs->effective_cpus); } set_cpus_allowed_ptr(task, new_cpus); } css_task_iter_end(&it); } /** * compute_effective_cpumask - Compute the effective cpumask of the cpuset * @new_cpus: the temp variable for the new effective_cpus mask * @cs: the cpuset the need to recompute the new effective_cpus mask * @parent: the parent cpuset * * The result is valid only if the given cpuset isn't a partition root. */ static void compute_effective_cpumask(struct cpumask *new_cpus, struct cpuset *cs, struct cpuset *parent) { cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); } /* * Commands for update_parent_effective_cpumask */ enum partition_cmd { partcmd_enable, /* Enable partition root */ partcmd_enablei, /* Enable isolated partition root */ partcmd_disable, /* Disable partition root */ partcmd_update, /* Update parent's effective_cpus */ partcmd_invalidate, /* Make partition invalid */ }; static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct tmpmasks *tmp); /* * Update partition exclusive flag * * Return: 0 if successful, an error code otherwise */ static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs) { bool exclusive = (new_prs > PRS_MEMBER); if (exclusive && !is_cpu_exclusive(cs)) { if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1)) return PERR_NOTEXCL; } else if (!exclusive && is_cpu_exclusive(cs)) { /* Turning off CS_CPU_EXCLUSIVE will not return error */ cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0); } return 0; } /* * Update partition load balance flag and/or rebuild sched domain * * Changing load balance flag will automatically call * rebuild_sched_domains_locked(). * This function is for cgroup v2 only. */ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) { int new_prs = cs->partition_root_state; bool rebuild_domains = (new_prs > 0) || (old_prs > 0); bool new_lb; /* * If cs is not a valid partition root, the load balance state * will follow its parent. */ if (new_prs > 0) { new_lb = (new_prs != PRS_ISOLATED); } else { new_lb = is_sched_load_balance(parent_cs(cs)); } if (new_lb != !!is_sched_load_balance(cs)) { rebuild_domains = true; if (new_lb) set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); else clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } if (rebuild_domains) cpuset_force_rebuild(); } /* * tasks_nocpu_error - Return true if tasks will have no effective_cpus */ static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs, struct cpumask *xcpus) { /* * A populated partition (cs or parent) can't have empty effective_cpus */ return (cpumask_subset(parent->effective_cpus, xcpus) && partition_is_populated(parent, cs)) || (!cpumask_intersects(xcpus, cpu_active_mask) && partition_is_populated(cs, NULL)); } static void reset_partition_data(struct cpuset *cs) { struct cpuset *parent = parent_cs(cs); if (!cpuset_v2()) return; lockdep_assert_held(&callback_lock); cs->nr_subparts = 0; if (cpumask_empty(cs->exclusive_cpus)) { cpumask_clear(cs->effective_xcpus); if (is_cpu_exclusive(cs)) clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); } if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed)) cpumask_copy(cs->effective_cpus, parent->effective_cpus); } /* * isolated_cpus_update - Update the isolated_cpus mask * @old_prs: old partition_root_state * @new_prs: new partition_root_state * @xcpus: exclusive CPUs with state change */ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus) { WARN_ON_ONCE(old_prs == new_prs); if (new_prs == PRS_ISOLATED) cpumask_or(isolated_cpus, isolated_cpus, xcpus); else cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); } /* * partition_xcpus_add - Add new exclusive CPUs to partition * @new_prs: new partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be added * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ static bool partition_xcpus_add(int new_prs, struct cpuset *parent, struct cpumask *xcpus) { bool isolcpus_updated; WARN_ON_ONCE(new_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) parent = &top_cpuset; if (parent == &top_cpuset) cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); isolcpus_updated = (new_prs != parent->partition_root_state); if (isolcpus_updated) isolated_cpus_update(parent->partition_root_state, new_prs, xcpus); cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); return isolcpus_updated; } /* * partition_xcpus_del - Remove exclusive CPUs from partition * @old_prs: old partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be removed * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ static bool partition_xcpus_del(int old_prs, struct cpuset *parent, struct cpumask *xcpus) { bool isolcpus_updated; WARN_ON_ONCE(old_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) parent = &top_cpuset; if (parent == &top_cpuset) cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); isolcpus_updated = (old_prs != parent->partition_root_state); if (isolcpus_updated) isolated_cpus_update(old_prs, parent->partition_root_state, xcpus); cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); return isolcpus_updated; } static void update_unbound_workqueue_cpumask(bool isolcpus_updated) { int ret; lockdep_assert_cpus_held(); if (!isolcpus_updated) return; ret = workqueue_unbound_exclude_cpumask(isolated_cpus); WARN_ON_ONCE(ret < 0); } /** * cpuset_cpu_is_isolated - Check if the given CPU is isolated * @cpu: the CPU number to be checked * Return: true if CPU is used in an isolated partition, false otherwise */ bool cpuset_cpu_is_isolated(int cpu) { return cpumask_test_cpu(cpu, isolated_cpus); } EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); /** * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets * @parent: Parent cpuset containing all siblings * @cs: Current cpuset (will be skipped) * @excpus: exclusive effective CPU mask to modify * * This function ensures the given @excpus mask doesn't include any CPUs that * are exclusively allocated to sibling cpusets. It walks through all siblings * of @cs under @parent and removes their exclusive CPUs from @excpus. */ static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, struct cpumask *excpus) { struct cgroup_subsys_state *css; struct cpuset *sibling; int retval = 0; if (cpumask_empty(excpus)) return retval; /* * Exclude exclusive CPUs from siblings */ rcu_read_lock(); cpuset_for_each_child(sibling, css, parent) { if (sibling == cs) continue; if (cpumask_intersects(excpus, sibling->exclusive_cpus)) { cpumask_andnot(excpus, excpus, sibling->exclusive_cpus); retval++; continue; } if (cpumask_intersects(excpus, sibling->effective_xcpus)) { cpumask_andnot(excpus, excpus, sibling->effective_xcpus); retval++; } } rcu_read_unlock(); return retval; } /* * compute_excpus - compute effective exclusive CPUs * @cs: cpuset * @xcpus: effective exclusive CPUs value to be set * Return: 0 if there is no sibling conflict, > 0 otherwise * * If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets * and exclude their exclusive_cpus or effective_xcpus as well. */ static int compute_excpus(struct cpuset *cs, struct cpumask *excpus) { struct cpuset *parent = parent_cs(cs); cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus); if (!cpumask_empty(cs->exclusive_cpus)) return 0; return rm_siblings_excl_cpus(parent, cs, excpus); } /* * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset * @trialcs: The trial cpuset containing the proposed new configuration * @cs: The original cpuset that the trial configuration is based on * Return: 0 if successful with no sibling conflict, >0 if a conflict is found * * Computes the effective_xcpus for a trial configuration. @cs is provided to represent * the real cs. */ static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) { struct cpuset *parent = parent_cs(trialcs); struct cpumask *excpus = trialcs->effective_xcpus; /* trialcs is member, cpuset.cpus has no impact to excpus */ if (cs_is_member(cs)) cpumask_and(excpus, trialcs->exclusive_cpus, parent->effective_xcpus); else cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus); return rm_siblings_excl_cpus(parent, cs, excpus); } static inline bool is_remote_partition(struct cpuset *cs) { return !list_empty(&cs->remote_sibling); } static inline bool is_local_partition(struct cpuset *cs) { return is_partition_valid(cs) && !is_remote_partition(cs); } /* * remote_partition_enable - Enable current cpuset as a remote partition root * @cs: the cpuset to update * @new_prs: new partition_root_state * @tmp: temporary masks * Return: 0 if successful, errcode if error * * Enable the current cpuset to become a remote partition root taking CPUs * directly from the top cpuset. cpuset_mutex must be held by the caller. */ static int remote_partition_enable(struct cpuset *cs, int new_prs, struct tmpmasks *tmp) { bool isolcpus_updated; /* * The user must have sysadmin privilege. */ if (!capable(CAP_SYS_ADMIN)) return PERR_ACCESS; /* * The requested exclusive_cpus must not be allocated to other * partitions and it can't use up all the root's effective_cpus. * * The effective_xcpus mask can contain offline CPUs, but there must * be at least one or more online CPUs present before it can be enabled. * * Note that creating a remote partition with any local partition root * above it or remote partition root underneath it is not allowed. */ compute_excpus(cs, tmp->new_cpus); WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) return PERR_INVCPUS; spin_lock_irq(&callback_lock); isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); list_add(&cs->remote_sibling, &remote_children); cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); cpuset_force_rebuild(); cs->prs_err = 0; /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. */ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); return 0; } /* * remote_partition_disable - Remove current cpuset from remote partition list * @cs: the cpuset to update * @tmp: temporary masks * * The effective_cpus is also updated. * * cpuset_mutex must be held by the caller. */ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) { bool isolcpus_updated; WARN_ON_ONCE(!is_remote_partition(cs)); WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); list_del_init(&cs->remote_sibling); isolcpus_updated = partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus); if (cs->prs_err) cs->partition_root_state = -cs->partition_root_state; else cs->partition_root_state = PRS_MEMBER; /* effective_xcpus may need to be changed */ compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); cpuset_force_rebuild(); /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. */ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); } /* * remote_cpus_update - cpus_exclusive change of remote partition * @cs: the cpuset to be updated * @xcpus: the new exclusive_cpus mask, if non-NULL * @excpus: the new effective_xcpus mask * @tmp: temporary masks * * top_cpuset and subpartitions_cpus will be updated or partition can be * invalidated. */ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, struct cpumask *excpus, struct tmpmasks *tmp) { bool adding, deleting; int prs = cs->partition_root_state; int isolcpus_updated = 0; if (WARN_ON_ONCE(!is_remote_partition(cs))) return; WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); if (cpumask_empty(excpus)) { cs->prs_err = PERR_CPUSEMPTY; goto invalidate; } adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus); deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus); /* * Additions of remote CPUs is only allowed if those CPUs are * not allocated to other partitions and there are effective_cpus * left in the top cpuset. */ if (adding) { WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus)); if (!capable(CAP_SYS_ADMIN)) cs->prs_err = PERR_ACCESS; else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) || cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) cs->prs_err = PERR_NOCPUS; if (cs->prs_err) goto invalidate; } spin_lock_irq(&callback_lock); if (adding) isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); if (deleting) isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); /* * Need to update effective_xcpus and exclusive_cpus now as * update_sibling_cpumasks() below may iterate back to the same cs. */ cpumask_copy(cs->effective_xcpus, excpus); if (xcpus) cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); if (adding || deleting) cpuset_force_rebuild(); /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. */ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); return; invalidate: remote_partition_disable(cs, tmp); } /* * prstate_housekeeping_conflict - check for partition & housekeeping conflicts * @prstate: partition root state to be checked * @new_cpus: cpu mask * Return: true if there is conflict, false otherwise * * CPUs outside of boot_hk_cpus, if defined, can only be used in an * isolated partition. */ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) { if (!have_boot_isolcpus) return false; if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) return true; return false; } /** * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state * @cmd: Partition root state change command * @newmask: Optional new cpumask for partcmd_update * @tmp: Temporary addmask and delmask * Return: 0 or a partition root state error code * * For partcmd_enable*, the cpuset is being transformed from a non-partition * root to a partition root. The effective_xcpus (cpus_allowed if * effective_xcpus not set) mask of the given cpuset will be taken away from * parent's effective_cpus. The function will return 0 if all the CPUs listed * in effective_xcpus can be granted or an error code will be returned. * * For partcmd_disable, the cpuset is being transformed from a partition * root back to a non-partition root. Any CPUs in effective_xcpus will be * given back to parent's effective_cpus. 0 will always be returned. * * For partcmd_update, if the optional newmask is specified, the cpu list is * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is * assumed to remain the same. The cpuset should either be a valid or invalid * partition root. The partition root state may change from valid to invalid * or vice versa. An error code will be returned if transitioning from * invalid to valid violates the exclusivity rule. * * For partcmd_invalidate, the current partition will be made invalid. * * The partcmd_enable* and partcmd_disable commands are used by * update_prstate(). An error code may be returned and the caller will check * for error. * * The partcmd_update command is used by update_cpumasks_hier() with newmask * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used * by update_cpumask() with NULL newmask. In both cases, the callers won't * check for error and so partition_root_state and prs_err will be updated * directly. */ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, struct cpumask *newmask, struct tmpmasks *tmp) { struct cpuset *parent = parent_cs(cs); int adding; /* Adding cpus to parent's effective_cpus */ int deleting; /* Deleting cpus from parent's effective_cpus */ int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ int subparts_delta = 0; int isolcpus_updated = 0; struct cpumask *xcpus = user_xcpus(cs); bool nocpu; lockdep_assert_held(&cpuset_mutex); WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */ /* * new_prs will only be changed for the partcmd_update and * partcmd_invalidate commands. */ adding = deleting = false; old_prs = new_prs = cs->partition_root_state; if (cmd == partcmd_invalidate) { if (is_partition_invalid(cs)) return 0; /* * Make the current partition invalid. */ if (is_partition_valid(parent)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); if (old_prs > 0) { new_prs = -old_prs; subparts_delta--; } goto write_error; } /* * The parent must be a partition root. * The new cpumask, if present, or the current cpus_allowed must * not be empty. */ if (!is_partition_valid(parent)) { return is_partition_invalid(parent) ? PERR_INVPARENT : PERR_NOTPART; } if (!newmask && xcpus_empty(cs)) return PERR_CPUSEMPTY; nocpu = tasks_nocpu_error(parent, cs, xcpus); if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* * Need to call compute_excpus() in case * exclusive_cpus not set. Sibling conflict should only happen * if exclusive_cpus isn't set. */ xcpus = tmp->delmask; if (compute_excpus(cs, xcpus)) WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; /* * Enabling partition root is not allowed if its * effective_xcpus is empty. */ if (cpumask_empty(xcpus)) return PERR_INVCPUS; if (prstate_housekeeping_conflict(new_prs, xcpus)) return PERR_HKEEPING; if (tasks_nocpu_error(parent, cs, xcpus)) return PERR_NOCPUS; /* * This function will only be called when all the preliminary * checks have passed. At this point, the following condition * should hold. * * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus * * Warn if it is not the case. */ cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask); WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); deleting = true; subparts_delta++; } else if (cmd == partcmd_disable) { /* * May need to add cpus back to parent's effective_cpus * (and maybe removed from subpartitions_cpus/isolated_cpus) * for valid partition root. xcpus may contain CPUs that * shouldn't be removed from the two global cpumasks. */ if (is_partition_valid(cs)) { cpumask_copy(tmp->addmask, cs->effective_xcpus); adding = true; subparts_delta--; } new_prs = PRS_MEMBER; } else if (newmask) { /* * Empty cpumask is not allowed */ if (cpumask_empty(newmask)) { part_error = PERR_CPUSEMPTY; goto write_error; } /* Check newmask again, whether cpus are available for parent/cs */ nocpu |= tasks_nocpu_error(parent, cs, newmask); /* * partcmd_update with newmask: * * Compute add/delete mask to/from effective_cpus * * For valid partition: * addmask = exclusive_cpus & ~newmask * & parent->effective_xcpus * delmask = newmask & ~exclusive_cpus * & parent->effective_xcpus * * For invalid partition: * delmask = newmask & parent->effective_xcpus */ if (is_partition_invalid(cs)) { adding = false; deleting = cpumask_and(tmp->delmask, newmask, parent->effective_xcpus); } else { cpumask_andnot(tmp->addmask, xcpus, newmask); adding = cpumask_and(tmp->addmask, tmp->addmask, parent->effective_xcpus); cpumask_andnot(tmp->delmask, newmask, xcpus); deleting = cpumask_and(tmp->delmask, tmp->delmask, parent->effective_xcpus); } /* * The new CPUs to be removed from parent's effective CPUs * must be present. */ if (deleting) { cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask); WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); } /* * Make partition invalid if parent's effective_cpus could * become empty and there are tasks in the parent. */ if (nocpu && (!adding || !cpumask_intersects(tmp->addmask, cpu_active_mask))) { part_error = PERR_NOCPUS; deleting = false; adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); } } else { /* * partcmd_update w/o newmask * * delmask = effective_xcpus & parent->effective_cpus * * This can be called from: * 1) update_cpumasks_hier() * 2) cpuset_hotplug_update_tasks() * * Check to see if it can be transitioned from valid to * invalid partition or vice versa. * * A partition error happens when parent has tasks and all * its effective CPUs will have to be distributed out. */ if (nocpu) { part_error = PERR_NOCPUS; if (is_partition_valid(cs)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) && cpumask_subset(xcpus, parent->effective_xcpus)) { struct cgroup_subsys_state *css; struct cpuset *child; bool exclusive = true; /* * Convert invalid partition to valid has to * pass the cpu exclusivity test. */ rcu_read_lock(); cpuset_for_each_child(child, css, parent) { if (child == cs) continue; if (!cpusets_are_exclusive(cs, child)) { exclusive = false; break; } } rcu_read_unlock(); if (exclusive) deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_cpus); else part_error = PERR_NOTEXCL; } } write_error: if (part_error) WRITE_ONCE(cs->prs_err, part_error); if (cmd == partcmd_update) { /* * Check for possible transition between valid and invalid * partition root. */ switch (cs->partition_root_state) { case PRS_ROOT: case PRS_ISOLATED: if (part_error) { new_prs = -old_prs; subparts_delta--; } break; case PRS_INVALID_ROOT: case PRS_INVALID_ISOLATED: if (!part_error) { new_prs = -old_prs; subparts_delta++; } break; } } if (!adding && !deleting && (new_prs == old_prs)) return 0; /* * Transitioning between invalid to valid or vice versa may require * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update, * validate_change() has already been successfully called and * CPU lists in cs haven't been updated yet. So defer it to later. */ if ((old_prs != new_prs) && (cmd != partcmd_update)) { int err = update_partition_exclusive_flag(cs, new_prs); if (err) return err; } /* * Change the parent's effective_cpus & effective_xcpus (top cpuset * only). * * Newly added CPUs will be removed from effective_cpus and * newly deleted ones will be added back to effective_cpus. */ spin_lock_irq(&callback_lock); if (old_prs != new_prs) { cs->partition_root_state = new_prs; if (new_prs <= 0) cs->nr_subparts = 0; } /* * Adding to parent's effective_cpus means deletion CPUs from cs * and vice versa. */ if (adding) isolcpus_updated += partition_xcpus_del(old_prs, parent, tmp->addmask); if (deleting) isolcpus_updated += partition_xcpus_add(new_prs, parent, tmp->delmask); if (is_partition_valid(parent)) { parent->nr_subparts += subparts_delta; WARN_ON_ONCE(parent->nr_subparts < 0); } spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive_flag(cs, new_prs); if (adding || deleting) { cpuset_update_tasks_cpumask(parent, tmp->addmask); update_sibling_cpumasks(parent, cs, tmp); } /* * For partcmd_update without newmask, it is being called from * cpuset_handle_hotplug(). Update the load balance flag and * scheduling domain accordingly. */ if ((cmd == partcmd_update) && !newmask) update_partition_sd_lb(cs, old_prs); notify_partition_change(cs, old_prs); return 0; } /** * compute_partition_effective_cpumask - compute effective_cpus for partition * @cs: partition root cpuset * @new_ecpus: previously computed effective_cpus to be updated * * Compute the effective_cpus of a partition root by scanning effective_xcpus * of child partition roots and excluding their effective_xcpus. * * This has the side effect of invalidating valid child partition roots, * if necessary. Since it is called from either cpuset_hotplug_update_tasks() * or update_cpumasks_hier() where parent and children are modified * successively, we don't need to call update_parent_effective_cpumask() * and the child's effective_cpus will be updated in later iterations. * * Note that rcu_read_lock() is assumed to be held. */ static void compute_partition_effective_cpumask(struct cpuset *cs, struct cpumask *new_ecpus) { struct cgroup_subsys_state *css; struct cpuset *child; bool populated = partition_is_populated(cs, NULL); /* * Check child partition roots to see if they should be * invalidated when * 1) child effective_xcpus not a subset of new * excluisve_cpus * 2) All the effective_cpus will be used up and cp * has tasks */ compute_excpus(cs, new_ecpus); cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); rcu_read_lock(); cpuset_for_each_child(child, css, cs) { if (!is_partition_valid(child)) continue; /* * There shouldn't be a remote partition underneath another * partition root. */ WARN_ON_ONCE(is_remote_partition(child)); child->prs_err = 0; if (!cpumask_subset(child->effective_xcpus, cs->effective_xcpus)) child->prs_err = PERR_INVCPUS; else if (populated && cpumask_subset(new_ecpus, child->effective_xcpus)) child->prs_err = PERR_NOCPUS; if (child->prs_err) { int old_prs = child->partition_root_state; /* * Invalidate child partition */ spin_lock_irq(&callback_lock); make_partition_invalid(child); cs->nr_subparts--; child->nr_subparts = 0; spin_unlock_irq(&callback_lock); notify_partition_change(child, old_prs); continue; } cpumask_andnot(new_ecpus, new_ecpus, child->effective_xcpus); } rcu_read_unlock(); } /* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree * @cs: the cpuset to consider * @tmp: temp variables for calculating effective_cpus & partition setup * @force: don't skip any descendant cpusets if set * * When configured cpumask is changed, the effective cpumasks of this cpuset * and all its descendants need to be updated. * * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. * * Called with cpuset_mutex held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, bool force) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; bool need_rebuild_sched_domains = false; int old_prs, new_prs; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); bool remote = is_remote_partition(cp); bool update_parent = false; old_prs = new_prs = cp->partition_root_state; /* * For child remote partition root (!= cs), we need to call * remote_cpus_update() if effective_xcpus will be changed. * Otherwise, we can skip the whole subtree. * * remote_cpus_update() will reuse tmp->new_cpus only after * its value is being processed. */ if (remote && (cp != cs)) { compute_excpus(cp, tmp->new_cpus); if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; } rcu_read_unlock(); remote_cpus_update(cp, NULL, tmp->new_cpus, tmp); rcu_read_lock(); /* Remote partition may be invalidated */ new_prs = cp->partition_root_state; remote = (new_prs == old_prs); } if (remote || (is_partition_valid(parent) && is_partition_valid(cp))) compute_partition_effective_cpumask(cp, tmp->new_cpus); else compute_effective_cpumask(tmp->new_cpus, cp, parent); if (remote) goto get_css; /* Ready to update cpuset data */ /* * A partition with no effective_cpus is allowed as long as * there is no task associated with it. Call * update_parent_effective_cpumask() to check it. */ if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) { update_parent = true; goto update_parent_effective; } /* * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs unless * it is a partition root that has explicitly distributed * out all its CPUs. */ if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) cpumask_copy(tmp->new_cpus, parent->effective_cpus); /* * Skip the whole subtree if * 1) the cpumask remains the same, * 2) has no partition root state, * 3) force flag not set, and * 4) for v2 load balance state same as its parent. */ if (!cp->partition_root_state && !force && cpumask_equal(tmp->new_cpus, cp->effective_cpus) && (!cpuset_v2() || (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { pos_css = css_rightmost_descendant(pos_css); continue; } update_parent_effective: /* * update_parent_effective_cpumask() should have been called * for cs already in update_cpumask(). We should also call * cpuset_update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's effective_cpus changes. */ if ((cp != cs) && old_prs) { switch (parent->partition_root_state) { case PRS_ROOT: case PRS_ISOLATED: update_parent = true; break; default: /* * When parent is not a partition root or is * invalid, child partition roots become * invalid too. */ if (is_partition_valid(cp)) new_prs = -cp->partition_root_state; WRITE_ONCE(cp->prs_err, is_partition_invalid(parent) ? PERR_INVPARENT : PERR_NOTPART); break; } } get_css: if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); if (update_parent) { update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp); /* * The cpuset partition_root_state may become * invalid. Capture it. */ new_prs = cp->partition_root_state; } spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) compute_excpus(cp, cp->effective_xcpus); /* * Make sure effective_xcpus is properly set for a valid * partition root. */ if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) cpumask_and(cp->effective_xcpus, cp->cpus_allowed, parent->effective_xcpus); else if (new_prs < 0) reset_partition_data(cp); spin_unlock_irq(&callback_lock); notify_partition_change(cp, old_prs); WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); cpuset_update_tasks_cpumask(cp, cp->effective_cpus); /* * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE * from parent if current cpuset isn't a valid partition root * and their load balance states differ. */ if (cpuset_v2() && !is_partition_valid(cp) && (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { if (is_sched_load_balance(parent)) set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); else clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); } /* * On legacy hierarchy, if the effective cpumask of any non- * empty cpuset is changed, we need to rebuild sched domains. * On default hierarchy, the cpuset needs to be a partition * root as well. */ if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && (!cpuset_v2() || is_partition_valid(cp))) need_rebuild_sched_domains = true; rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); if (need_rebuild_sched_domains) cpuset_force_rebuild(); } /** * update_sibling_cpumasks - Update siblings cpumasks * @parent: Parent cpuset * @cs: Current cpuset * @tmp: Temp variables */ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct tmpmasks *tmp) { struct cpuset *sibling; struct cgroup_subsys_state *pos_css; lockdep_assert_held(&cpuset_mutex); /* * Check all its siblings and call update_cpumasks_hier() * if their effective_cpus will need to be changed. * * It is possible a change in parent's effective_cpus * due to a change in a child partition's effective_xcpus will impact * its siblings even if they do not inherit parent's effective_cpus * directly. * * The update_cpumasks_hier() function may sleep. So we have to * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { if (sibling == cs) continue; if (!is_partition_valid(sibling)) { compute_effective_cpumask(tmp->new_cpus, sibling, parent); if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) continue; } else if (is_remote_partition(sibling)) { /* * Change in a sibling cpuset won't affect a remote * partition root. */ continue; } if (!css_tryget_online(&sibling->css)) continue; rcu_read_unlock(); update_cpumasks_hier(sibling, tmp, false); rcu_read_lock(); css_put(&sibling->css); } rcu_read_unlock(); } static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask) { int retval; retval = cpulist_parse(buf, out_mask); if (retval < 0) return retval; if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed)) return -EINVAL; return 0; } /** * validate_partition - Validate a cpuset partition configuration * @cs: The cpuset to validate * @trialcs: The trial cpuset containing proposed configuration changes * * If any validation check fails, the appropriate error code is set in the * cpuset's prs_err field. * * Return: PRS error code (0 if valid, non-zero error code if invalid) */ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs) { struct cpuset *parent = parent_cs(cs); if (cs_is_member(trialcs)) return PERR_NONE; if (cpumask_empty(trialcs->effective_xcpus)) return PERR_INVCPUS; if (prstate_housekeeping_conflict(trialcs->partition_root_state, trialcs->effective_xcpus)) return PERR_HKEEPING; if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) return PERR_NOCPUS; return PERR_NONE; } static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, struct tmpmasks *tmp) { int retval; struct cpuset *parent = parent_cs(cs); retval = validate_change(cs, trialcs); if ((retval == -EINVAL) && cpuset_v2()) { struct cgroup_subsys_state *css; struct cpuset *cp; /* * The -EINVAL error code indicates that partition sibling * CPU exclusivity rule has been violated. We still allow * the cpumask change to proceed while invalidating the * partition. However, any conflicting sibling partitions * have to be marked as invalid too. */ trialcs->prs_err = PERR_NOTEXCL; rcu_read_lock(); cpuset_for_each_child(cp, css, parent) { struct cpumask *xcpus = user_xcpus(trialcs); if (is_partition_valid(cp) && cpumask_intersects(xcpus, cp->effective_xcpus)) { rcu_read_unlock(); update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); rcu_read_lock(); } } rcu_read_unlock(); retval = 0; } return retval; } /** * partition_cpus_change - Handle partition state changes due to CPU mask updates * @cs: The target cpuset being modified * @trialcs: The trial cpuset containing proposed configuration changes * @tmp: Temporary masks for intermediate calculations * * This function handles partition state transitions triggered by CPU mask changes. * CPU modifications may cause a partition to be disabled or require state updates. */ static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs, struct tmpmasks *tmp) { enum prs_errcode prs_err; if (cs_is_member(cs)) return; prs_err = validate_partition(cs, trialcs); if (prs_err) trialcs->prs_err = cs->prs_err = prs_err; if (is_remote_partition(cs)) { if (trialcs->prs_err) remote_partition_disable(cs, tmp); else remote_cpus_update(cs, trialcs->exclusive_cpus, trialcs->effective_xcpus, tmp); } else { if (trialcs->prs_err) update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp); else update_parent_effective_cpumask(cs, partcmd_update, trialcs->effective_xcpus, tmp); } } /** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider * @trialcs: trial cpuset * @buf: buffer of cpu numbers written to this cpuset */ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; struct tmpmasks tmp; bool force = false; int old_prs = cs->partition_root_state; retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed); if (retval < 0) return retval; /* Nothing to do if the cpus didn't change */ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; if (alloc_tmpmasks(&tmp)) return -ENOMEM; compute_trialcs_excpus(trialcs, cs); trialcs->prs_err = PERR_NONE; retval = cpus_allowed_validate_change(cs, trialcs, &tmp); if (retval < 0) goto out_free; /* * Check all the descendants in update_cpumasks_hier() if * effective_xcpus is to be changed. */ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); partition_cpus_change(cs, trialcs, &tmp); spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); if ((old_prs > 0) && !is_partition_valid(cs)) reset_partition_data(cs); spin_unlock_irq(&callback_lock); /* effective_cpus/effective_xcpus will be updated here */ update_cpumasks_hier(cs, &tmp, force); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); out_free: free_tmpmasks(&tmp); return retval; } /** * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset * @cs: the cpuset to consider * @trialcs: trial cpuset * @buf: buffer of cpu numbers written to this cpuset * * The tasks' cpumask will be updated if cs is a valid partition root. */ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; struct tmpmasks tmp; bool force = false; int old_prs = cs->partition_root_state; retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus); if (retval < 0) return retval; /* Nothing to do if the CPUs didn't change */ if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) return 0; /* * Reject the change if there is exclusive CPUs conflict with * the siblings. */ if (compute_trialcs_excpus(trialcs, cs)) return -EINVAL; /* * Check all the descendants in update_cpumasks_hier() if * effective_xcpus is to be changed. */ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); retval = validate_change(cs, trialcs); if (retval) return retval; if (alloc_tmpmasks(&tmp)) return -ENOMEM; trialcs->prs_err = PERR_NONE; partition_cpus_change(cs, trialcs, &tmp); spin_lock_irq(&callback_lock); cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); if ((old_prs > 0) && !is_partition_valid(cs)) reset_partition_data(cs); spin_unlock_irq(&callback_lock); /* * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus * of the subtree when it is a valid partition root or effective_xcpus * is updated. */ if (is_partition_valid(cs) || force) update_cpumasks_hier(cs, &tmp, force); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); free_tmpmasks(&tmp); return 0; } /* * Migrate memory region from one set of nodes to another. This is * performed asynchronously as it can be called from process migration path * holding locks involved in process management. All mm migrations are * performed in the queued order and can be waited for by flushing * cpuset_migrate_mm_wq. */ struct cpuset_migrate_mm_work { struct work_struct work; struct mm_struct *mm; nodemask_t from; nodemask_t to; }; static void cpuset_migrate_mm_workfn(struct work_struct *work) { struct cpuset_migrate_mm_work *mwork = container_of(work, struct cpuset_migrate_mm_work, work); /* on a wq worker, no need to worry about %current's mems_allowed */ do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); mmput(mwork->mm); kfree(mwork); } static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { struct cpuset_migrate_mm_work *mwork; if (nodes_equal(*from, *to)) { mmput(mm); return; } mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); if (mwork) { mwork->mm = mm; mwork->from = *from; mwork->to = *to; INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); queue_work(cpuset_migrate_mm_wq, &mwork->work); } else { mmput(mm); } } static void flush_migrate_mm_task_workfn(struct callback_head *head) { flush_workqueue(cpuset_migrate_mm_wq); kfree(head); } static void schedule_flush_migrate_mm(void) { struct callback_head *flush_cb; flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL); if (!flush_cb) return; init_task_work(flush_cb, flush_migrate_mm_task_workfn); if (task_work_add(current, flush_cb, TWA_RESUME)) kfree(flush_cb); } /* * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy * @tsk: the task to change * @newmems: new nodes that the task will be set * * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed * and rebind an eventual tasks' mempolicy. If the task is allocating in * parallel, it might temporarily see an empty intersection, which results in * a seqlock check and retry before OOM or allocation failure. */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { task_lock(tsk); local_irq_disable(); write_seqcount_begin(&tsk->mems_allowed_seq); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems); tsk->mems_allowed = *newmems; write_seqcount_end(&tsk->mems_allowed_seq); local_irq_enable(); task_unlock(tsk); } static void *cpuset_being_rebound; /** * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. */ void cpuset_update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct css_task_iter it; struct task_struct *task; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ guarantee_online_mems(cs, &newmems); /* * The mpol_rebind_mm() call takes mmap_lock, which we couldn't * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold * the global cpuset_mutex, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) { struct mm_struct *mm; bool migrate; cpuset_change_task_nodemask(task, &newmems); mm = get_task_mm(task); if (!mm) continue; migrate = is_memory_migrate(cs); mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); else mmput(mm); } css_task_iter_end(&it); /* * All the tasks' nodemasks have been updated, update * cs->old_mems_allowed. */ cs->old_mems_allowed = newmems; /* We're done rebinding vmas to this cpuset's new mems_allowed. */ cpuset_being_rebound = NULL; } /* * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree * @cs: the cpuset to consider * @new_mems: a temp variable for calculating new effective_mems * * When configured nodemask is changed, the effective nodemasks of this cpuset * and all its descendants need to be updated. * * On legacy hierarchy, effective_mems will be the same with mems_allowed. * * Called with cpuset_mutex held */ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); /* * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ if (is_in_v2_mode() && nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ if (nodes_equal(*new_mems, cp->effective_mems)) { pos_css = css_rightmost_descendant(pos_css); continue; } if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); spin_lock_irq(&callback_lock); cp->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); cpuset_update_tasks_nodemask(cp); rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); } /* * Handle user request to change the 'mems' memory placement * of a cpuset. Needs to validate the request, update the * cpusets mems_allowed, and for each task in the cpuset, * update mems_allowed and rebind task's mempolicy and any vma * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. * The validate_change() call ensures that cpusets with tasks have memory. */ retval = nodelist_parse(buf, trialcs->mems_allowed); if (retval < 0) goto done; if (!nodes_subset(trialcs->mems_allowed, top_cpuset.mems_allowed)) { retval = -EINVAL; goto done; } if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } retval = validate_change(cs, trialcs); if (retval < 0) goto done; check_insane_mems_config(&trialcs->mems_allowed); spin_lock_irq(&callback_lock); cs->mems_allowed = trialcs->mems_allowed; spin_unlock_irq(&callback_lock); /* use trialcs->mems_allowed as a temp variable */ update_nodemasks_hier(cs, &trialcs->mems_allowed); done: return retval; } bool current_cpuset_is_being_rebound(void) { bool ret; rcu_read_lock(); ret = task_cs(current) == cpuset_being_rebound; rcu_read_unlock(); return ret; } /* * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (see cpuset_flagbits_t) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * * Call with cpuset_mutex held. */ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { struct cpuset *trialcs; int balance_flag_changed; int spread_flag_changed; int err; trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) return -ENOMEM; if (turning_on) set_bit(bit, &trialcs->flags); else clear_bit(bit, &trialcs->flags); err = validate_change(cs, trialcs); if (err < 0) goto out; balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(trialcs)); spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) || (is_spread_page(cs) != is_spread_page(trialcs))); spin_lock_irq(&callback_lock); cs->flags = trialcs->flags; spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) { if (cpuset_v2()) cpuset_force_rebuild(); else rebuild_sched_domains_locked(); } if (spread_flag_changed) cpuset1_update_tasks_flags(cs); out: free_cpuset(trialcs); return err; } /** * update_prstate - update partition_root_state * @cs: the cpuset to update * @new_prs: new partition root state * Return: 0 if successful, != 0 if error * * Call with cpuset_mutex held. */ static int update_prstate(struct cpuset *cs, int new_prs) { int err = PERR_NONE, old_prs = cs->partition_root_state; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; bool isolcpus_updated = false; if (old_prs == new_prs) return 0; /* * Treat a previously invalid partition root as if it is a "member". */ if (new_prs && is_partition_invalid(cs)) old_prs = PRS_MEMBER; if (alloc_tmpmasks(&tmpmask)) return -ENOMEM; err = update_partition_exclusive_flag(cs, new_prs); if (err) goto out; if (!old_prs) { /* * cpus_allowed and exclusive_cpus cannot be both empty. */ if (xcpus_empty(cs)) { err = PERR_CPUSEMPTY; goto out; } /* * We don't support the creation of a new local partition with * a remote partition underneath it. This unsupported * setting can happen only if parent is the top_cpuset because * a remote partition cannot be created underneath an existing * local or remote partition. */ if ((parent == &top_cpuset) && cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) { err = PERR_REMOTE; goto out; } /* * If parent is valid partition, enable local partiion. * Otherwise, enable a remote partition. */ if (is_partition_valid(parent)) { enum partition_cmd cmd = (new_prs == PRS_ROOT) ? partcmd_enable : partcmd_enablei; err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); } else { err = remote_partition_enable(cs, new_prs, &tmpmask); } } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. * Need to update isolated_cpus. */ isolcpus_updated = true; } else { /* * Switching back to member is always allowed even if it * disables child partitions. */ if (is_remote_partition(cs)) remote_partition_disable(cs, &tmpmask); else update_parent_effective_cpumask(cs, partcmd_disable, NULL, &tmpmask); /* * Invalidation of child partitions will be done in * update_cpumasks_hier(). */ } out: /* * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error * happens. */ if (err) { new_prs = -new_prs; update_partition_exclusive_flag(cs, new_prs); } spin_lock_irq(&callback_lock); cs->partition_root_state = new_prs; WRITE_ONCE(cs->prs_err, err); if (!is_partition_valid(cs)) reset_partition_data(cs); else if (isolcpus_updated) isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); /* Force update if switching back to member & update effective_xcpus */ update_cpumasks_hier(cs, &tmpmask, !new_prs); /* A newly created partition must have effective_xcpus set */ WARN_ON_ONCE(!old_prs && (new_prs > 0) && cpumask_empty(cs->effective_xcpus)); /* Update sched domains and load balance flag */ update_partition_sd_lb(cs, old_prs); notify_partition_change(cs, old_prs); if (force_sd_rebuild) rebuild_sched_domains_locked(); free_tmpmasks(&tmpmask); return 0; } static struct cpuset *cpuset_attach_old_cs; /* * Check to see if a cpuset can accept a new task * For v1, cpus_allowed and mems_allowed can't be empty. * For v2, effective_cpus can't be empty. * Note that in v1, effective_cpus = cpus_allowed. */ static int cpuset_can_attach_check(struct cpuset *cs) { if (cpumask_empty(cs->effective_cpus) || (!is_in_v2_mode() && nodes_empty(cs->mems_allowed))) return -ENOSPC; return 0; } static void reset_migrate_dl_data(struct cpuset *cs) { cs->nr_migrate_dl_tasks = 0; cs->sum_migrate_dl_bw = 0; } /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; struct cpuset *cs, *oldcs; struct task_struct *task; bool cpus_updated, mems_updated; int ret; /* used later by cpuset_attach() */ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); oldcs = cpuset_attach_old_cs; cs = css_cs(css); mutex_lock(&cpuset_mutex); /* Check to see if task is allowed in the cpuset */ ret = cpuset_can_attach_check(cs); if (ret) goto out_unlock; cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task); if (ret) goto out_unlock; /* * Skip rights over task check in v2 when nothing changes, * migration permission derives from hierarchy ownership in * cgroup_procs_write_permission()). */ if (!cpuset_v2() || (cpus_updated || mems_updated)) { ret = security_task_setscheduler(task); if (ret) goto out_unlock; } if (dl_task(task)) { cs->nr_migrate_dl_tasks++; cs->sum_migrate_dl_bw += task->dl.dl_bw; } } if (!cs->nr_migrate_dl_tasks) goto out_success; if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); if (unlikely(cpu >= nr_cpu_ids)) { reset_migrate_dl_data(cs); ret = -EINVAL; goto out_unlock; } ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); if (ret) { reset_migrate_dl_data(cs); goto out_unlock; } } out_success: /* * Mark attach is in progress. This makes validate_change() fail * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; out_unlock: mutex_unlock(&cpuset_mutex); return ret; } static void cpuset_cancel_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; struct cpuset *cs; cgroup_taskset_first(tset, &css); cs = css_cs(css); mutex_lock(&cpuset_mutex); dec_attach_in_progress_locked(cs); if (cs->nr_migrate_dl_tasks) { int cpu = cpumask_any(cs->effective_cpus); dl_bw_free(cpu, cs->sum_migrate_dl_bw); reset_migrate_dl_data(cs); } mutex_unlock(&cpuset_mutex); } /* * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() * but we can't allocate it dynamically there. Define it global and * allocate from cpuset_init(). */ static cpumask_var_t cpus_attach; static nodemask_t cpuset_attach_nodemask_to; static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) { lockdep_assert_held(&cpuset_mutex); if (cs != &top_cpuset) guarantee_active_cpus(task, cpus_attach); else cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), subpartitions_cpus); /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here */ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset1_update_task_spread_flags(cs, task); } static void cpuset_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct task_struct *leader; struct cgroup_subsys_state *css; struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; bool cpus_updated, mems_updated; bool queue_task_work = false; cgroup_taskset_first(tset, &css); cs = css_cs(css); lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ mutex_lock(&cpuset_mutex); cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); /* * In the default hierarchy, enabling cpuset in the child cgroups * will trigger a number of cpuset_attach() calls with no change * in effective cpus and mems. In that case, we can optimize out * by skipping the task iteration and update. */ if (cpuset_v2() && !cpus_updated && !mems_updated) { cpuset_attach_nodemask_to = cs->effective_mems; goto out; } guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, css, tset) cpuset_attach_task(cs, task); /* * Change mm for all threadgroup leaders. This is expensive and may * sleep and should be moved outside migration path proper. Skip it * if there is no change in effective_mems and CS_MEMORY_MIGRATE is * not set. */ cpuset_attach_nodemask_to = cs->effective_mems; if (!is_memory_migrate(cs) && !mems_updated) goto out; cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); if (mm) { mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); /* * old_mems_allowed is the same with mems_allowed * here, except if this task is being moved * automatically due to hotplug. In that case * @mems_allowed has been updated and is empty, so * @old_mems_allowed is the right nodesets that we * migrate mm from. */ if (is_memory_migrate(cs)) { cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); queue_task_work = true; } else mmput(mm); } } out: if (queue_task_work) schedule_flush_migrate_mm(); cs->old_mems_allowed = cpuset_attach_nodemask_to; if (cs->nr_migrate_dl_tasks) { cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; reset_migrate_dl_data(cs); } dec_attach_in_progress_locked(cs); mutex_unlock(&cpuset_mutex); } /* * Common handling for a write to a "cpus" or "mems" file. */ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); struct cpuset *trialcs; int retval = -ENODEV; /* root is read-only */ if (cs == &top_cpuset) return -EACCES; buf = strstrip(buf); cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) { retval = -ENOMEM; goto out_unlock; } switch (of_cft(of)->private) { case FILE_CPULIST: retval = update_cpumask(cs, trialcs, buf); break; case FILE_EXCLUSIVE_CPULIST: retval = update_exclusive_cpumask(cs, trialcs, buf); break; case FILE_MEMLIST: retval = update_nodemask(cs, trialcs, buf); break; default: retval = -EINVAL; break; } free_cpuset(trialcs); if (force_sd_rebuild) rebuild_sched_domains_locked(); out_unlock: cpuset_full_unlock(); if (of_cft(of)->private == FILE_MEMLIST) schedule_flush_migrate_mm(); return retval ?: nbytes; } /* * These ascii lists should be read in a single call, by using a user * buffer large enough to hold the entire map. If read in smaller * chunks, there is no guarantee of atomicity. Since the display format * used, list of ranges of sequential numbers, is variable length, * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. */ int cpuset_common_seq_show(struct seq_file *sf, void *v) { struct cpuset *cs = css_cs(seq_css(sf)); cpuset_filetype_t type = seq_cft(sf)->private; int ret = 0; spin_lock_irq(&callback_lock); switch (type) { case FILE_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); break; case FILE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); break; case FILE_EFFECTIVE_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus)); break; case FILE_EFFECTIVE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); break; case FILE_EXCLUSIVE_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus)); break; case FILE_EFFECTIVE_XCPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus)); break; case FILE_SUBPARTS_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus)); break; case FILE_ISOLATED_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus)); break; default: ret = -EINVAL; } spin_unlock_irq(&callback_lock); return ret; } static int cpuset_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); const char *err, *type = NULL; switch (cs->partition_root_state) { case PRS_ROOT: seq_puts(seq, "root\n"); break; case PRS_ISOLATED: seq_puts(seq, "isolated\n"); break; case PRS_MEMBER: seq_puts(seq, "member\n"); break; case PRS_INVALID_ROOT: type = "root"; fallthrough; case PRS_INVALID_ISOLATED: if (!type) type = "isolated"; err = perr_strings[READ_ONCE(cs->prs_err)]; if (err) seq_printf(seq, "%s invalid (%s)\n", type, err); else seq_printf(seq, "%s invalid\n", type); break; } return 0; } static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); int val; int retval = -ENODEV; buf = strstrip(buf); if (!strcmp(buf, "root")) val = PRS_ROOT; else if (!strcmp(buf, "member")) val = PRS_MEMBER; else if (!strcmp(buf, "isolated")) val = PRS_ISOLATED; else return -EINVAL; cpuset_full_lock(); if (is_cpuset_online(cs)) retval = update_prstate(cs, val); cpuset_full_unlock(); return retval ?: nbytes; } /* * This is currently a minimal set for the default hierarchy. It can be * expanded later on by migrating more features and control files from v1. */ static struct cftype dfl_files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "mems", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "cpus.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_CPULIST, }, { .name = "mems.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_MEMLIST, }, { .name = "cpus.partition", .seq_show = cpuset_partition_show, .write = cpuset_partition_write, .private = FILE_PARTITION_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct cpuset, partition_file), }, { .name = "cpus.exclusive", .seq_show = cpuset_common_seq_show, .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_EXCLUSIVE_CPULIST, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "cpus.exclusive.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_XCPULIST, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "cpus.subpartitions", .seq_show = cpuset_common_seq_show, .private = FILE_SUBPARTS_CPULIST, .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, }, { .name = "cpus.isolated", .seq_show = cpuset_common_seq_show, .private = FILE_ISOLATED_CPULIST, .flags = CFTYPE_ONLY_ON_ROOT, }, { } /* terminate */ }; /** * cpuset_css_alloc - Allocate a cpuset css * @parent_css: Parent css of the control group that the new cpuset will be * part of * Return: cpuset css on success, -ENOMEM on failure. * * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return * top cpuset css otherwise. */ static struct cgroup_subsys_state * cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuset *cs; if (!parent_css) return &top_cpuset.css; cs = dup_or_alloc_cpuset(NULL); if (!cs) return ERR_PTR(-ENOMEM); __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; INIT_LIST_HEAD(&cs->remote_sibling); /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cpuset_v2()) __set_bit(CS_MEMORY_MIGRATE, &cs->flags); return &cs->css; } static int cpuset_css_online(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); struct cpuset *tmp_cs; struct cgroup_subsys_state *pos_css; if (!parent) return 0; cpuset_full_lock(); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); /* * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated */ if (cpuset_v2() && !is_sched_load_balance(parent)) clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpuset_inc(); spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } spin_unlock_irq(&callback_lock); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is * set. This flag handling is implemented in cgroup core for * historical reasons - the flag may be specified during mount. * * Currently, if any sibling cpusets have exclusive cpus or mem, we * refuse to clone the configuration - thereby refusing the task to * be entered, and as a result refusing the sys_unshare() or * clone() which initiated it. If this becomes a problem for some * users who wish to allow that scenario, then this could be * changed to grant parent->cpus_allowed-sibling_cpus_exclusive * (and likewise for mems) to the new cgroup. */ rcu_read_lock(); cpuset_for_each_child(tmp_cs, pos_css, parent) { if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { rcu_read_unlock(); goto out_unlock; } } rcu_read_unlock(); spin_lock_irq(&callback_lock); cs->mems_allowed = parent->mems_allowed; cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: cpuset_full_unlock(); return 0; } /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which * will call rebuild_sched_domains_locked(). That is not needed * in the default hierarchy where only changes in partition * will cause repartitioning. */ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); cpuset_full_lock(); if (!cpuset_v2() && is_sched_load_balance(cs)) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); cpuset_full_unlock(); } /* * If a dying cpuset has the 'cpus.partition' enabled, turn it off by * changing it back to member to free its exclusive CPUs back to the pool to * be used by other online cpusets. */ static void cpuset_css_killed(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); cpuset_full_lock(); /* Reset valid partition back to member */ if (is_partition_valid(cs)) update_prstate(cs, PRS_MEMBER); cpuset_full_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); free_cpuset(cs); } static void cpuset_bind(struct cgroup_subsys_state *root_css) { mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { cpumask_copy(top_cpuset.cpus_allowed, top_cpuset.effective_cpus); top_cpuset.mems_allowed = top_cpuset.effective_mems; } spin_unlock_irq(&callback_lock); mutex_unlock(&cpuset_mutex); } /* * In case the child is cloned into a cpuset different from its parent, * additional checks are done to see if the move is allowed. */ static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) { struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); bool same_cs; int ret; rcu_read_lock(); same_cs = (cs == task_cs(current)); rcu_read_unlock(); if (same_cs) return 0; lockdep_assert_held(&cgroup_mutex); mutex_lock(&cpuset_mutex); /* Check to see if task is allowed in the cpuset */ ret = cpuset_can_attach_check(cs); if (ret) goto out_unlock; ret = task_can_attach(task); if (ret) goto out_unlock; ret = security_task_setscheduler(task); if (ret) goto out_unlock; /* * Mark attach is in progress. This makes validate_change() fail * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; out_unlock: mutex_unlock(&cpuset_mutex); return ret; } static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) { struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); bool same_cs; rcu_read_lock(); same_cs = (cs == task_cs(current)); rcu_read_unlock(); if (same_cs) return; dec_attach_in_progress(cs); } /* * Make sure the new task conform to the current state of its parent, * which could have been changed by cpuset just after it inherits the * state from the parent and before it sits on the cgroup's task list. */ static void cpuset_fork(struct task_struct *task) { struct cpuset *cs; bool same_cs; rcu_read_lock(); cs = task_cs(task); same_cs = (cs == task_cs(current)); rcu_read_unlock(); if (same_cs) { if (cs == &top_cpuset) return; set_cpus_allowed_ptr(task, current->cpus_ptr); task->mems_allowed = current->mems_allowed; return; } /* CLONE_INTO_CGROUP */ mutex_lock(&cpuset_mutex); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cpuset_attach_task(cs, task); dec_attach_in_progress_locked(cs); mutex_unlock(&cpuset_mutex); } struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, .css_killed = cpuset_css_killed, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .bind = cpuset_bind, .can_fork = cpuset_can_fork, .cancel_fork = cpuset_cancel_fork, .fork = cpuset_fork, #ifdef CONFIG_CPUSETS_V1 .legacy_cftypes = cpuset1_files, #endif .dfl_cftypes = dfl_files, .early_init = true, .threaded = true, }; /** * cpuset_init - initialize cpusets at system boot * * Description: Initialize top_cpuset **/ int __init cpuset_init(void) { BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); cpumask_setall(top_cpuset.effective_xcpus); cpumask_setall(top_cpuset.exclusive_cpus); nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN); if (have_boot_isolcpus) { BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL)); cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus); } return 0; } static void hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { /* A partition root is allowed to have empty effective cpus */ if (cpumask_empty(new_cpus) && !is_partition_valid(cs)) cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); if (nodes_empty(*new_mems)) *new_mems = parent_cs(cs)->effective_mems; spin_lock_irq(&callback_lock); cpumask_copy(cs->effective_cpus, new_cpus); cs->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); if (cpus_updated) cpuset_update_tasks_cpumask(cs, new_cpus); if (mems_updated) cpuset_update_tasks_nodemask(cs); } void cpuset_force_rebuild(void) { force_sd_rebuild = true; } /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest * @tmp: the tmpmasks structure pointer * * Compare @cs's cpu and mem masks against top_cpuset and if some have gone * offline, update @cs accordingly. If @cs ends up with no CPU or memory, * all its tasks are moved to the nearest ancestor with both resources. */ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) { static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated; bool mems_updated; bool remote; int partcmd = -1; struct cpuset *parent; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); mutex_lock(&cpuset_mutex); /* * We have raced with task attaching. We wait until attaching * is finished, so we won't attach a task to an empty cpuset. */ if (cs->attach_in_progress) { mutex_unlock(&cpuset_mutex); goto retry; } parent = parent_cs(cs); compute_effective_cpumask(&new_cpus, cs, parent); nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); if (!tmp || !cs->partition_root_state) goto update_tasks; /* * Compute effective_cpus for valid partition root, may invalidate * child partition roots if necessary. */ remote = is_remote_partition(cs); if (remote || (is_partition_valid(cs) && is_partition_valid(parent))) compute_partition_effective_cpumask(cs, &new_cpus); if (remote && cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) { cs->prs_err = PERR_HOTPLUG; remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; } /* * Force the partition to become invalid if either one of * the following conditions hold: * 1) empty effective cpus but not valid empty partition. * 2) parent is invalid or doesn't grant any cpus to child * partitions. */ if (is_local_partition(cs) && (!is_partition_valid(parent) || tasks_nocpu_error(parent, cs, &new_cpus))) partcmd = partcmd_invalidate; /* * On the other hand, an invalid partition root may be transitioned * back to a regular one with a non-empty effective xcpus. */ else if (is_partition_valid(parent) && is_partition_invalid(cs) && !cpumask_empty(cs->effective_xcpus)) partcmd = partcmd_update; if (partcmd >= 0) { update_parent_effective_cpumask(cs, partcmd, NULL, tmp); if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); } } update_tasks: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); if (!cpus_updated && !mems_updated) goto unlock; /* Hotplug doesn't affect this cpuset */ if (mems_updated) check_insane_mems_config(&new_mems); if (is_in_v2_mode()) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); unlock: mutex_unlock(&cpuset_mutex); } /** * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always * synchronized to cpu_active_mask and N_MEMORY, which is necessary in * order to make cpusets transparent (of no affect) on systems that are * actively using CPU hotplug but making no active use of cpusets. * * Non-root cpusets are only affected by offlining. If any CPUs or memory * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on * all descendants. * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. * * CPU / memory hotplug is handled synchronously. */ static void cpuset_handle_hotplug(void) { static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; bool on_dfl = is_in_v2_mode(); struct tmpmasks tmp, *ptmp = NULL; if (on_dfl && !alloc_tmpmasks(&tmp)) ptmp = &tmp; lockdep_assert_cpus_held(); mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ cpumask_copy(&new_cpus, cpu_active_mask); new_mems = node_states[N_MEMORY]; /* * If subpartitions_cpus is populated, it is likely that the check * below will produce a false positive on cpus_updated when the cpu * list isn't changed. It is extra work, but it is better to be safe. */ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) || !cpumask_empty(subpartitions_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); /* For v1, synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { cpuset_force_rebuild(); spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); /* * Make sure that CPUs allocated to child partitions * do not show up in effective_cpus. If no CPU is left, * we clear the subpartitions_cpus & let the child partitions * fight for the CPUs again. */ if (!cpumask_empty(subpartitions_cpus)) { if (cpumask_subset(&new_cpus, subpartitions_cpus)) { top_cpuset.nr_subparts = 0; cpumask_clear(subpartitions_cpus); } else { cpumask_andnot(&new_cpus, &new_cpus, subpartitions_cpus); } } cpumask_copy(top_cpuset.effective_cpus, &new_cpus); spin_unlock_irq(&callback_lock); /* we don't mess with cpumasks of tasks in top_cpuset */ } /* synchronize mems_allowed to N_MEMORY */ if (mems_updated) { spin_lock_irq(&callback_lock); if (!on_dfl) top_cpuset.mems_allowed = new_mems; top_cpuset.effective_mems = new_mems; spin_unlock_irq(&callback_lock); cpuset_update_tasks_nodemask(&top_cpuset); } mutex_unlock(&cpuset_mutex); /* if cpus or mems changed, we need to propagate to descendants */ if (cpus_updated || mems_updated) { struct cpuset *cs; struct cgroup_subsys_state *pos_css; rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (cs == &top_cpuset || !css_tryget_online(&cs->css)) continue; rcu_read_unlock(); cpuset_hotplug_update_tasks(cs, ptmp); rcu_read_lock(); css_put(&cs->css); } rcu_read_unlock(); } /* rebuild sched domains if necessary */ if (force_sd_rebuild) rebuild_sched_domains_cpuslocked(); free_tmpmasks(ptmp); } void cpuset_update_active_cpus(void) { /* * We're inside cpu hotplug critical region which usually nests * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. */ cpuset_handle_hotplug(); } /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. * See cpuset_update_active_cpus() for CPU hotplug handling. */ static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { cpuset_handle_hotplug(); return NOTIFY_OK; } /** * cpuset_init_smp - initialize cpus_allowed * * Description: Finish top cpuset after cpu, node maps are initialized */ void __init cpuset_init_smp(void) { /* * cpus_allowd/mems_allowed set to v2 values in the initial * cpuset_bind() call will be reset to v1 values in another * cpuset_bind() call when v1 cpuset is mounted. */ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); BUG_ON(!cpuset_migrate_mm_wq); } /** * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. * * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty * subset of cpu_active_mask, even if this means going outside the * tasks cpuset, except when the task is in the top cpuset. **/ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { unsigned long flags; struct cpuset *cs; spin_lock_irqsave(&callback_lock, flags); cs = task_cs(tsk); if (cs != &top_cpuset) guarantee_active_cpus(tsk, pmask); /* * Tasks in the top cpuset won't get update to their cpumasks * when a hotplug online/offline event happens. So we include all * offline cpus in the allowed cpu list. */ if ((cs == &top_cpuset) || cpumask_empty(pmask)) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); /* * We first exclude cpus allocated to partitions. If there is no * allowable online cpu left, we fall back to all possible cpus. */ cpumask_andnot(pmask, possible_mask, subpartitions_cpus); if (!cpumask_intersects(pmask, cpu_active_mask)) cpumask_copy(pmask, possible_mask); } spin_unlock_irqrestore(&callback_lock, flags); } /** * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. * @tsk: pointer to task_struct with which the scheduler is struggling * * Description: In the case that the scheduler cannot find an allowed cpu in * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy * mode however, this value is the same as task_cs(tsk)->effective_cpus, * which will not contain a sane cpumask during cases such as cpu hotplugging. * This is the absolute last resort for the scheduler and it is only used if * _every_ other avenue has been traveled. * * Returns true if the affinity of @tsk was changed, false otherwise. **/ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); const struct cpumask *cs_mask; bool changed = false; rcu_read_lock(); cs_mask = task_cs(tsk)->cpus_allowed; if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { do_set_cpus_allowed(tsk, cs_mask); changed = true; } rcu_read_unlock(); /* * We own tsk->cpus_allowed, nobody can change it under us. * * But we used cs && cs->cpus_allowed lockless and thus can * race with cgroup_attach_task() or update_cpumask() and get * the wrong tsk->cpus_allowed. However, both cases imply the * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() * which takes task_rq_lock(). * * If we are called after it dropped the lock we must see all * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary * set any mask even if it is not right from task_cs() pov, * the pending set_cpus_allowed_ptr() will fix things. * * select_fallback_rq() will fix things ups and set cpu_possible_mask * if required. */ return changed; } void __init cpuset_init_current_mems_allowed(void) { nodes_setall(current->mems_allowed); } /** * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. * * Description: Returns the nodemask_t mems_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty * subset of node_states[N_MEMORY], even if this means going outside the * tasks cpuset. **/ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { nodemask_t mask; unsigned long flags; spin_lock_irqsave(&callback_lock, flags); guarantee_online_mems(task_cs(tsk), &mask); spin_unlock_irqrestore(&callback_lock, flags); return mask; } /** * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed * @nodemask: the nodemask to be checked * * Are any of the nodes in the nodemask allowed in current->mems_allowed? */ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { return nodes_intersects(*nodemask, current->mems_allowed); } /* * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or * mem_hardwall ancestor to the specified cpuset. Call holding * callback_lock. If no ancestor is mem_exclusive or mem_hardwall * (an unusual configuration), then returns the root cpuset. */ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) { while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) cs = parent_cs(cs); return cs; } /* * cpuset_current_node_allowed - Can current task allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * * If we're in interrupt, yes, we can always allocate. If @node is set in * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, * yes. If current has access to memory reserves as an oom victim, yes. * Otherwise, no. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset * unless the task has been OOM killed. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * * Scanning up parent cpusets requires callback_lock. The * __alloc_pages() routine only calls here with __GFP_HARDWALL bit * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the * current tasks mems_allowed came up empty on the first pass over * the zonelist. So only GFP_KERNEL allocations, if all nodes in the * cpuset are short of memory, might require taking the callback_lock. * * The first call here from mm/page_alloc:get_page_from_freelist() * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, * so no allocation on a node outside the cpuset is allowed (unless * in interrupt, of course). * * The second pass through get_page_from_freelist() doesn't even call * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set * in alloc_flags. That logic and the checks below have the combined * affect that: * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok * tsk_is_oom_victim - any node ok * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ bool allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) return true; if (node_isset(node, current->mems_allowed)) return true; /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ if (unlikely(tsk_is_oom_victim(current))) return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ return false; if (current->flags & PF_EXITING) /* Let dying task have memory */ return true; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); spin_unlock_irqrestore(&callback_lock, flags); return allowed; } bool cpuset_node_allowed(struct cgroup *cgroup, int nid) { struct cgroup_subsys_state *css; struct cpuset *cs; bool allowed; /* * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy * and mems_allowed is likely to be empty even if we could get to it, * so return true to avoid taking a global lock on the empty check. */ if (!cpuset_v2()) return true; css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys); if (!css) return true; /* * Normally, accessing effective_mems would require the cpuset_mutex * or callback_lock - but node_isset is atomic and the reference * taken via cgroup_get_e_css is sufficient to protect css. * * Since this interface is intended for use by migration paths, we * relax locking here to avoid taking global locks - while accepting * there may be rare scenarios where the result may be innaccurate. * * Reclaim and migration are subject to these same race conditions, and * cannot make strong isolation guarantees, so this is acceptable. */ cs = container_of(css, struct cpuset, css); allowed = node_isset(nid, cs->effective_mems); css_put(css); return allowed; } /** * cpuset_spread_node() - On which node to begin search for a page * @rotor: round robin rotor * * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for * tasks in a cpuset with is_spread_page or is_spread_slab set), * and if the memory allocation used cpuset_mem_spread_node() * to determine on which node to start looking, as it will for * certain page cache or slab cache pages such as used for file * system buffers and inode caches, then instead of starting on the * local node to look for a free page, rather spread the starting * node around the tasks mems_allowed nodes. * * We don't have to worry about the returned node being offline * because "it can't happen", and even if it did, it would be ok. * * The routines calling guarantee_online_mems() are careful to * only set nodes in task->mems_allowed that are online. So it * should not be possible for the following code to return an * offline node. But if it did, that would be ok, as this routine * is not returning the node where the allocation must be, only * the node where the search should start. The zonelist passed to * __alloc_pages() will include all nodes. If the slab allocator * is passed an offline node, it will fall back to the local node. * See kmem_cache_alloc_node(). */ static int cpuset_spread_node(int *rotor) { return *rotor = next_node_in(*rotor, current->mems_allowed); } /** * cpuset_mem_spread_node() - On which node to begin search for a file page */ int cpuset_mem_spread_node(void) { if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) current->cpuset_mem_spread_rotor = node_random(&current->mems_allowed); return cpuset_spread_node(&current->cpuset_mem_spread_rotor); } /** * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? * @tsk1: pointer to task_struct of some task. * @tsk2: pointer to task_struct of some other task. * * Description: Return true if @tsk1's mems_allowed intersects the * mems_allowed of @tsk2. Used by the OOM killer to determine if * one of the task's memory usage might impact the memory available * to the other. **/ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2) { return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); } /** * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed * * Description: Prints current's name, cpuset name, and cached copy of its * mems_allowed to the kernel log. */ void cpuset_print_current_mems_allowed(void) { struct cgroup *cgrp; rcu_read_lock(); cgrp = task_cs(current)->css.cgroup; pr_cont(",cpuset="); pr_cont_cgroup_name(cgrp); pr_cont(",mems_allowed=%*pbl", nodemask_pr_args(&current->mems_allowed)); rcu_read_unlock(); } /* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Mems_allowed:\t%*pb\n", nodemask_pr_args(&task->mems_allowed)); seq_printf(m, "Mems_allowed_list:\t%*pbl\n", nodemask_pr_args(&task->mems_allowed)); }
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 // SPDX-License-Identifier: LGPL-2.1 /* * SPNEGO upcall management for CIFS * * Copyright (c) 2007 Red Hat, Inc. * Author(s): Jeff Layton (jlayton@redhat.com) * */ #include <linux/list.h> #include <linux/slab.h> #include <linux/string.h> #include <keys/user-type.h> #include <linux/key-type.h> #include <linux/keyctl.h> #include <linux/inet.h> #include "cifsglob.h" #include "cifs_spnego.h" #include "cifs_debug.h" #include "cifsproto.h" static const struct cred *spnego_cred; /* create a new cifs key */ static int cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { char *payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL); if (!payload) return -ENOMEM; /* attach the data */ key->payload.data[0] = payload; return 0; } static void cifs_spnego_key_destroy(struct key *key) { kfree(key->payload.data[0]); } /* * keytype for CIFS spnego keys */ struct key_type cifs_spnego_key_type = { .name = "cifs.spnego", .instantiate = cifs_spnego_key_instantiate, .destroy = cifs_spnego_key_destroy, .describe = user_describe, }; /* length of longest version string e.g. strlen("ver=0xFF") */ #define MAX_VER_STR_LEN 8 /* length of longest security mechanism name, eg in future could have * strlen(";sec=ntlmsspi") */ #define MAX_MECH_STR_LEN 13 /* strlen of ";host=" */ #define HOST_KEY_LEN 6 /* strlen of ";ip4=" or ";ip6=" */ #define IP_KEY_LEN 5 /* strlen of ";uid=0x" */ #define UID_KEY_LEN 7 /* strlen of ";creduid=0x" */ #define CREDUID_KEY_LEN 11 /* strlen of ";user=" */ #define USER_KEY_LEN 6 /* strlen of ";pid=0x" */ #define PID_KEY_LEN 7 /* strlen of ";upcall_target=" */ #define UPCALL_TARGET_KEY_LEN 15 /* get a key struct with a SPNEGO security blob, suitable for session setup */ struct key * cifs_get_spnego_key(struct cifs_ses *sesInfo, struct TCP_Server_Info *server) { struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; char *description, *dp; size_t desc_len; struct key *spnego_key; const char *hostname = server->hostname; const struct cred *saved_cred; /* length of fields (with semicolons): ver=0xyz ip4=ipaddress host=hostname sec=mechanism uid=0xFF user=username */ desc_len = MAX_VER_STR_LEN + HOST_KEY_LEN + strlen(hostname) + IP_KEY_LEN + INET6_ADDRSTRLEN + MAX_MECH_STR_LEN + UID_KEY_LEN + (sizeof(uid_t) * 2) + CREDUID_KEY_LEN + (sizeof(uid_t) * 2) + PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; if (sesInfo->user_name) desc_len += USER_KEY_LEN + strlen(sesInfo->user_name); if (sesInfo->upcall_target == UPTARGET_MOUNT) desc_len += UPCALL_TARGET_KEY_LEN + 5; // strlen("mount") else desc_len += UPCALL_TARGET_KEY_LEN + 3; // strlen("app") spnego_key = ERR_PTR(-ENOMEM); description = kzalloc(desc_len, GFP_KERNEL); if (description == NULL) goto out; dp = description; /* start with version and hostname portion of UNC string */ spnego_key = ERR_PTR(-EINVAL); dp += sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION, hostname); /* add the server address */ if (server->dstaddr.ss_family == AF_INET) dp += sprintf(dp, "ip4=%pI4", &sa->sin_addr); else if (server->dstaddr.ss_family == AF_INET6) dp += sprintf(dp, "ip6=%pI6", &sa6->sin6_addr); else goto out; /* for now, only sec=krb5 and sec=mskrb5 and iakerb are valid */ if (server->sec_kerberos) dp += sprintf(dp, ";sec=krb5"); else if (server->sec_mskerberos) dp += sprintf(dp, ";sec=mskrb5"); else if (server->sec_iakerb) dp += sprintf(dp, ";sec=iakerb"); else { cifs_dbg(VFS, "unknown or missing server auth type, use krb5\n"); dp += sprintf(dp, ";sec=krb5"); } dp += sprintf(dp, ";uid=0x%x", from_kuid_munged(&init_user_ns, sesInfo->linux_uid)); dp += sprintf(dp, ";creduid=0x%x", from_kuid_munged(&init_user_ns, sesInfo->cred_uid)); if (sesInfo->user_name) dp += sprintf(dp, ";user=%s", sesInfo->user_name); dp += sprintf(dp, ";pid=0x%x", current->pid); if (sesInfo->upcall_target == UPTARGET_MOUNT) dp += sprintf(dp, ";upcall_target=mount"); else dp += sprintf(dp, ";upcall_target=app"); cifs_dbg(FYI, "key description = %s\n", description); saved_cred = override_creds(spnego_cred); spnego_key = request_key(&cifs_spnego_key_type, description, ""); revert_creds(saved_cred); #ifdef CONFIG_CIFS_DEBUG2 if (cifsFYI && !IS_ERR(spnego_key)) { struct cifs_spnego_msg *msg = spnego_key->payload.data[0]; cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024U, msg->secblob_len + msg->sesskey_len)); } #endif /* CONFIG_CIFS_DEBUG2 */ out: kfree(description); return spnego_key; } int init_cifs_spnego(void) { struct cred *cred; struct key *keyring; int ret; cifs_dbg(FYI, "Registering the %s key type\n", cifs_spnego_key_type.name); /* * Create an override credential set with special thread keyring for * spnego upcalls. */ cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; keyring = keyring_alloc(".cifs_spnego", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } ret = register_key_type(&cifs_spnego_key_type); if (ret < 0) goto failed_put_key; /* * instruct request_key() to use this special keyring as a cache for * the results it looks up */ set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags); cred->thread_keyring = keyring; cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; spnego_cred = cred; cifs_dbg(FYI, "cifs spnego keyring: %d\n", key_serial(keyring)); return 0; failed_put_key: key_put(keyring); failed_put_cred: put_cred(cred); return ret; } void exit_cifs_spnego(void) { key_revoke(spnego_cred->thread_keyring); unregister_key_type(&cifs_spnego_key_type); put_cred(spnego_cred); cifs_dbg(FYI, "Unregistered %s key type\n", cifs_spnego_key_type.name); }
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 // SPDX-License-Identifier: GPL-2.0-only /* * * Author Karsten Keil <kkeil@novell.com> * * Copyright 2008 by Karsten Keil <kkeil@novell.com> */ #include <linux/slab.h> #include <linux/mISDNif.h> #include <linux/kthread.h> #include <linux/sched.h> #include <linux/sched/cputime.h> #include <linux/signal.h> #include "core.h" static u_int *debug; static inline void _queue_message(struct mISDNstack *st, struct sk_buff *skb) { struct mISDNhead *hh = mISDN_HEAD_P(skb); if (*debug & DEBUG_QUEUE_FUNC) printk(KERN_DEBUG "%s prim(%x) id(%x) %p\n", __func__, hh->prim, hh->id, skb); skb_queue_tail(&st->msgq, skb); if (likely(!test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_set_bit(mISDN_STACK_WORK, &st->status); wake_up_interruptible(&st->workq); } } static int mISDN_queue_message(struct mISDNchannel *ch, struct sk_buff *skb) { _queue_message(ch->st, skb); return 0; } static struct mISDNchannel * get_channel4id(struct mISDNstack *st, u_int id) { struct mISDNchannel *ch; mutex_lock(&st->lmutex); list_for_each_entry(ch, &st->layer2, list) { if (id == ch->nr) goto unlock; } ch = NULL; unlock: mutex_unlock(&st->lmutex); return ch; } static void send_socklist(struct mISDN_sock_list *sl, struct sk_buff *skb) { struct sock *sk; struct sk_buff *cskb = NULL; read_lock(&sl->lock); sk_for_each(sk, &sl->head) { if (sk->sk_state != MISDN_BOUND) continue; if (!cskb) cskb = skb_copy(skb, GFP_ATOMIC); if (!cskb) { printk(KERN_WARNING "%s no skb\n", __func__); break; } if (!sock_queue_rcv_skb(sk, cskb)) cskb = NULL; } read_unlock(&sl->lock); dev_kfree_skb(cskb); } static void send_layer2(struct mISDNstack *st, struct sk_buff *skb) { struct sk_buff *cskb; struct mISDNhead *hh = mISDN_HEAD_P(skb); struct mISDNchannel *ch; int ret; if (!st) return; mutex_lock(&st->lmutex); if ((hh->id & MISDN_ID_ADDR_MASK) == MISDN_ID_ANY) { /* L2 for all */ list_for_each_entry(ch, &st->layer2, list) { if (list_is_last(&ch->list, &st->layer2)) { cskb = skb; skb = NULL; } else { cskb = skb_copy(skb, GFP_KERNEL); } if (cskb) { ret = ch->send(ch, cskb); if (ret) { if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s ch%d prim(%x) addr(%x)" " err %d\n", __func__, ch->nr, hh->prim, ch->addr, ret); dev_kfree_skb(cskb); } } else { printk(KERN_WARNING "%s ch%d addr %x no mem\n", __func__, ch->nr, ch->addr); goto out; } } } else { list_for_each_entry(ch, &st->layer2, list) { if ((hh->id & MISDN_ID_ADDR_MASK) == ch->addr) { ret = ch->send(ch, skb); if (!ret) skb = NULL; goto out; } } ret = st->dev->teimgr->ctrl(st->dev->teimgr, CHECK_DATA, skb); if (!ret) skb = NULL; else if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s mgr prim(%x) err %d\n", __func__, hh->prim, ret); } out: mutex_unlock(&st->lmutex); dev_kfree_skb(skb); } static inline int send_msg_to_layer(struct mISDNstack *st, struct sk_buff *skb) { struct mISDNhead *hh = mISDN_HEAD_P(skb); struct mISDNchannel *ch; int lm; lm = hh->prim & MISDN_LAYERMASK; if (*debug & DEBUG_QUEUE_FUNC) printk(KERN_DEBUG "%s prim(%x) id(%x) %p\n", __func__, hh->prim, hh->id, skb); if (lm == 0x1) { if (!hlist_empty(&st->l1sock.head)) { __net_timestamp(skb); send_socklist(&st->l1sock, skb); } return st->layer1->send(st->layer1, skb); } else if (lm == 0x2) { if (!hlist_empty(&st->l1sock.head)) send_socklist(&st->l1sock, skb); send_layer2(st, skb); return 0; } else if (lm == 0x4) { ch = get_channel4id(st, hh->id); if (ch) return ch->send(ch, skb); else printk(KERN_WARNING "%s: dev(%s) prim(%x) id(%x) no channel\n", __func__, dev_name(&st->dev->dev), hh->prim, hh->id); } else if (lm == 0x8) { WARN_ON(lm == 0x8); ch = get_channel4id(st, hh->id); if (ch) return ch->send(ch, skb); else printk(KERN_WARNING "%s: dev(%s) prim(%x) id(%x) no channel\n", __func__, dev_name(&st->dev->dev), hh->prim, hh->id); } else { /* broadcast not handled yet */ printk(KERN_WARNING "%s: dev(%s) prim %x not delivered\n", __func__, dev_name(&st->dev->dev), hh->prim); } return -ESRCH; } static void do_clear_stack(struct mISDNstack *st) { } static int mISDNStackd(void *data) { struct mISDNstack *st = data; #ifdef MISDN_MSG_STATS u64 utime, stime; #endif int err = 0; sigfillset(&current->blocked); if (*debug & DEBUG_MSG_THREAD) printk(KERN_DEBUG "mISDNStackd %s started\n", dev_name(&st->dev->dev)); if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } for (;;) { struct sk_buff *skb; if (unlikely(test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); } else test_and_set_bit(mISDN_STACK_RUNNING, &st->status); while (test_bit(mISDN_STACK_WORK, &st->status)) { skb = skb_dequeue(&st->msgq); if (!skb) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); /* test if a race happens */ skb = skb_dequeue(&st->msgq); if (!skb) continue; test_and_set_bit(mISDN_STACK_WORK, &st->status); } #ifdef MISDN_MSG_STATS st->msg_cnt++; #endif err = send_msg_to_layer(st, skb); if (unlikely(err)) { if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s: %s prim(%x) id(%x) " "send call(%d)\n", __func__, dev_name(&st->dev->dev), mISDN_HEAD_PRIM(skb), mISDN_HEAD_ID(skb), err); dev_kfree_skb(skb); continue; } if (unlikely(test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); break; } } if (test_bit(mISDN_STACK_CLEARING, &st->status)) { test_and_set_bit(mISDN_STACK_STOPPED, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); do_clear_stack(st); test_and_clear_bit(mISDN_STACK_CLEARING, &st->status); test_and_set_bit(mISDN_STACK_RESTART, &st->status); } if (test_and_clear_bit(mISDN_STACK_RESTART, &st->status)) { test_and_clear_bit(mISDN_STACK_STOPPED, &st->status); test_and_set_bit(mISDN_STACK_RUNNING, &st->status); if (!skb_queue_empty(&st->msgq)) test_and_set_bit(mISDN_STACK_WORK, &st->status); } if (test_bit(mISDN_STACK_ABORT, &st->status)) break; if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } #ifdef MISDN_MSG_STATS st->sleep_cnt++; #endif test_and_clear_bit(mISDN_STACK_ACTIVE, &st->status); wait_event_interruptible(st->workq, (st->status & mISDN_STACK_ACTION_MASK)); if (*debug & DEBUG_MSG_THREAD) printk(KERN_DEBUG "%s: %s wake status %08lx\n", __func__, dev_name(&st->dev->dev), st->status); test_and_set_bit(mISDN_STACK_ACTIVE, &st->status); test_and_clear_bit(mISDN_STACK_WAKEUP, &st->status); if (test_bit(mISDN_STACK_STOPPED, &st->status)) { test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); #ifdef MISDN_MSG_STATS st->stopped_cnt++; #endif } } #ifdef MISDN_MSG_STATS printk(KERN_DEBUG "mISDNStackd daemon for %s proceed %d " "msg %d sleep %d stopped\n", dev_name(&st->dev->dev), st->msg_cnt, st->sleep_cnt, st->stopped_cnt); task_cputime(st->thread, &utime, &stime); printk(KERN_DEBUG "mISDNStackd daemon for %s utime(%llu) stime(%llu)\n", dev_name(&st->dev->dev), utime, stime); printk(KERN_DEBUG "mISDNStackd daemon for %s nvcsw(%ld) nivcsw(%ld)\n", dev_name(&st->dev->dev), st->thread->nvcsw, st->thread->nivcsw); printk(KERN_DEBUG "mISDNStackd daemon for %s killed now\n", dev_name(&st->dev->dev)); #endif test_and_set_bit(mISDN_STACK_KILLED, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); test_and_clear_bit(mISDN_STACK_ACTIVE, &st->status); test_and_clear_bit(mISDN_STACK_ABORT, &st->status); skb_queue_purge(&st->msgq); st->thread = NULL; if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } return 0; } static int l1_receive(struct mISDNchannel *ch, struct sk_buff *skb) { if (!ch->st) return -ENODEV; __net_timestamp(skb); _queue_message(ch->st, skb); return 0; } void set_channel_address(struct mISDNchannel *ch, u_int sapi, u_int tei) { ch->addr = sapi | (tei << 8); } void __add_layer2(struct mISDNchannel *ch, struct mISDNstack *st) { list_add_tail(&ch->list, &st->layer2); } void add_layer2(struct mISDNchannel *ch, struct mISDNstack *st) { mutex_lock(&st->lmutex); __add_layer2(ch, st); mutex_unlock(&st->lmutex); } static int st_own_ctrl(struct mISDNchannel *ch, u_int cmd, void *arg) { if (!ch->st || !ch->st->layer1) return -EINVAL; return ch->st->layer1->ctrl(ch->st->layer1, cmd, arg); } int create_stack(struct mISDNdevice *dev) { struct mISDNstack *newst; int err; DECLARE_COMPLETION_ONSTACK(done); newst = kzalloc(sizeof(struct mISDNstack), GFP_KERNEL); if (!newst) { printk(KERN_ERR "kmalloc mISDN_stack failed\n"); return -ENOMEM; } newst->dev = dev; INIT_LIST_HEAD(&newst->layer2); INIT_HLIST_HEAD(&newst->l1sock.head); rwlock_init(&newst->l1sock.lock); init_waitqueue_head(&newst->workq); skb_queue_head_init(&newst->msgq); mutex_init(&newst->lmutex); dev->D.st = newst; err = create_teimanager(dev); if (err) { printk(KERN_ERR "kmalloc teimanager failed\n"); kfree(newst); return err; } dev->teimgr->peer = &newst->own; dev->teimgr->recv = mISDN_queue_message; dev->teimgr->st = newst; newst->layer1 = &dev->D; dev->D.recv = l1_receive; dev->D.peer = &newst->own; newst->own.st = newst; newst->own.ctrl = st_own_ctrl; newst->own.send = mISDN_queue_message; newst->own.recv = mISDN_queue_message; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s)\n", __func__, dev_name(&newst->dev->dev)); newst->notify = &done; newst->thread = kthread_run(mISDNStackd, (void *)newst, "mISDN_%s", dev_name(&newst->dev->dev)); if (IS_ERR(newst->thread)) { err = PTR_ERR(newst->thread); printk(KERN_ERR "mISDN:cannot create kernel thread for %s (%d)\n", dev_name(&newst->dev->dev), err); delete_teimanager(dev->teimgr); kfree(newst); } else wait_for_completion(&done); return err; } int connect_layer1(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct mISDN_sock *msk = container_of(ch, struct mISDN_sock, ch); struct channel_req rq; int err; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); switch (protocol) { case ISDN_P_NT_S0: case ISDN_P_NT_E1: case ISDN_P_TE_S0: case ISDN_P_TE_E1: ch->recv = mISDN_queue_message; ch->peer = &dev->D.st->own; ch->st = dev->D.st; rq.protocol = protocol; rq.adr.channel = adr->channel; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret %d (dev %d)\n", __func__, err, dev->id); if (err) return err; write_lock_bh(&dev->D.st->l1sock.lock); sk_add_node(&msk->sk, &dev->D.st->l1sock.head); write_unlock_bh(&dev->D.st->l1sock.lock); break; default: return -ENOPROTOOPT; } return 0; } int connect_Bstack(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct channel_req rq, rq2; int pmask, err; struct Bprotocol *bp; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); ch->st = dev->D.st; pmask = 1 << (protocol & ISDN_P_B_MASK); if (pmask & dev->Bprotocols) { rq.protocol = protocol; rq.adr = *adr; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); if (err) return err; ch->recv = rq.ch->send; ch->peer = rq.ch; rq.ch->recv = ch->send; rq.ch->peer = ch; rq.ch->st = dev->D.st; } else { bp = get_Bprotocol4mask(pmask); if (!bp) return -ENOPROTOOPT; rq2.protocol = protocol; rq2.adr = *adr; rq2.ch = ch; err = bp->create(&rq2); if (err) return err; ch->recv = rq2.ch->send; ch->peer = rq2.ch; rq2.ch->st = dev->D.st; rq.protocol = rq2.protocol; rq.adr = *adr; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); if (err) { rq2.ch->ctrl(rq2.ch, CLOSE_CHANNEL, NULL); return err; } rq2.ch->recv = rq.ch->send; rq2.ch->peer = rq.ch; rq.ch->recv = rq2.ch->send; rq.ch->peer = rq2.ch; rq.ch->st = dev->D.st; } ch->protocol = protocol; ch->nr = rq.ch->nr; return 0; } int create_l2entity(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct channel_req rq; int err; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); rq.protocol = ISDN_P_TE_S0; if (dev->Dprotocols & (1 << ISDN_P_TE_E1)) rq.protocol = ISDN_P_TE_E1; switch (protocol) { case ISDN_P_LAPD_NT: rq.protocol = ISDN_P_NT_S0; if (dev->Dprotocols & (1 << ISDN_P_NT_E1)) rq.protocol = ISDN_P_NT_E1; fallthrough; case ISDN_P_LAPD_TE: ch->recv = mISDN_queue_message; ch->peer = &dev->D.st->own; ch->st = dev->D.st; rq.adr.channel = 0; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret 1 %d\n", __func__, err); if (err) break; rq.protocol = protocol; rq.adr = *adr; rq.ch = ch; err = dev->teimgr->ctrl(dev->teimgr, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret 2 %d\n", __func__, err); if (!err) { if ((protocol == ISDN_P_LAPD_NT) && !rq.ch) break; add_layer2(rq.ch, dev->D.st); rq.ch->recv = mISDN_queue_message; rq.ch->peer = &dev->D.st->own; rq.ch->ctrl(rq.ch, OPEN_CHANNEL, NULL); /* can't fail */ } break; default: err = -EPROTONOSUPPORT; } return err; } void delete_channel(struct mISDNchannel *ch) { struct mISDN_sock *msk = container_of(ch, struct mISDN_sock, ch); struct mISDNchannel *pch; if (!ch->st) { printk(KERN_WARNING "%s: no stack\n", __func__); return; } if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s) protocol(%x)\n", __func__, dev_name(&ch->st->dev->dev), ch->protocol); if (ch->protocol >= ISDN_P_B_START) { if (ch->peer) { ch->peer->ctrl(ch->peer, CLOSE_CHANNEL, NULL); ch->peer = NULL; } return; } switch (ch->protocol) { case ISDN_P_NT_S0: case ISDN_P_TE_S0: case ISDN_P_NT_E1: case ISDN_P_TE_E1: write_lock_bh(&ch->st->l1sock.lock); sk_del_node_init(&msk->sk); write_unlock_bh(&ch->st->l1sock.lock); ch->st->dev->D.ctrl(&ch->st->dev->D, CLOSE_CHANNEL, NULL); break; case ISDN_P_LAPD_TE: pch = get_channel4id(ch->st, ch->nr); if (pch) { mutex_lock(&ch->st->lmutex); list_del(&pch->list); mutex_unlock(&ch->st->lmutex); pch->ctrl(pch, CLOSE_CHANNEL, NULL); pch = ch->st->dev->teimgr; pch->ctrl(pch, CLOSE_CHANNEL, NULL); } else printk(KERN_WARNING "%s: no l2 channel\n", __func__); break; case ISDN_P_LAPD_NT: pch = ch->st->dev->teimgr; if (pch) { pch->ctrl(pch, CLOSE_CHANNEL, NULL); } else printk(KERN_WARNING "%s: no l2 channel\n", __func__); break; default: break; } return; } void delete_stack(struct mISDNdevice *dev) { struct mISDNstack *st = dev->D.st; DECLARE_COMPLETION_ONSTACK(done); if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s)\n", __func__, dev_name(&st->dev->dev)); if (dev->teimgr) delete_teimanager(dev->teimgr); if (st->thread) { if (st->notify) { printk(KERN_WARNING "%s: notifier in use\n", __func__); complete(st->notify); } st->notify = &done; test_and_set_bit(mISDN_STACK_ABORT, &st->status); test_and_set_bit(mISDN_STACK_WAKEUP, &st->status); wake_up_interruptible(&st->workq); wait_for_completion(&done); } if (!list_empty(&st->layer2)) printk(KERN_WARNING "%s: layer2 list not empty\n", __func__); if (!hlist_empty(&st->l1sock.head)) printk(KERN_WARNING "%s: layer1 list not empty\n", __func__); kfree(st); } void mISDN_initstack(u_int *dp) { debug = dp; }
33 18 1 25 47 43 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* * linux/fs/nls/nls_koi8-ru.c * * Charset koi8-ru translation based on charset koi8-u. * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static struct nls_table *p_nls; static int uni2char(const wchar_t uni, unsigned char *out, int boundlen) { if (boundlen <= 0) return -ENAMETOOLONG; if ((uni & 0xffaf) == 0x040e || (uni & 0xffce) == 0x254c) { /* koi8-ru and koi8-u differ only on two characters */ if (uni == 0x040e) out[0] = 0xbe; else if (uni == 0x045e) out[0] = 0xae; else if (uni == 0x255d || uni == 0x256c) return 0; else return p_nls->uni2char(uni, out, boundlen); return 1; } else /* fast path */ return p_nls->uni2char(uni, out, boundlen); } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { int n; if ((*rawstring & 0xef) != 0xae) { /* koi8-ru and koi8-u differ only on two characters */ *uni = (*rawstring & 0x10) ? 0x040e : 0x045e; return 1; } n = p_nls->char2uni(rawstring, boundlen, uni); return n; } static struct nls_table table = { .charset = "koi8-ru", .uni2char = uni2char, .char2uni = char2uni, }; static int __init init_nls_koi8_ru(void) { p_nls = load_nls("koi8-u"); if (p_nls) { table.charset2upper = p_nls->charset2upper; table.charset2lower = p_nls->charset2lower; return register_nls(&table); } return -EINVAL; } static void __exit exit_nls_koi8_ru(void) { unregister_nls(&table); unload_nls(p_nls); } module_init(init_nls_koi8_ru) module_exit(exit_nls_koi8_ru) MODULE_DESCRIPTION("NLS KOI8-RU (Belarusian)"); MODULE_LICENSE("Dual BSD/GPL");
4282 2 2 4178 108 4102 4104 3103 48 3083 8 8 4410 2639 332 4403 4412 2724 4407 4410 4402 4406 1256 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/main.c - Where the driver meets power management. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * * The driver model core calls device_pm_add() when a device is registered. * This will initialize the embedded device_pm_info object in the device * and add it to the list of power-controlled devices. sysfs entries for * controlling device power management will also be added. * * A separate list is used for keeping track of power info, because the power * domain dependencies may differ from the ancestral dependencies that the * subsystem list maintains. */ #define pr_fmt(fmt) "PM: " fmt #define dev_fmt pr_fmt #include <linux/device.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/pm.h> #include <linux/pm_runtime.h> #include <linux/pm-trace.h> #include <linux/pm_wakeirq.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/async.h> #include <linux/suspend.h> #include <trace/events/power.h> #include <linux/cpufreq.h> #include <linux/devfreq.h> #include <linux/timer.h> #include "../base.h" #include "power.h" typedef int (*pm_callback_t)(struct device *); /* * The entries in the dpm_list list are in a depth first order, simply * because children are guaranteed to be discovered after parents, and * are inserted at the back of the list on discovery. * * Since device_pm_add() may be called with a device lock held, * we must never try to acquire a device lock while holding * dpm_list_mutex. */ LIST_HEAD(dpm_list); static LIST_HEAD(dpm_prepared_list); static LIST_HEAD(dpm_suspended_list); static LIST_HEAD(dpm_late_early_list); static LIST_HEAD(dpm_noirq_list); static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; static DEFINE_MUTEX(async_wip_mtx); static int async_error; /** * pm_hibernate_is_recovering - if recovering from hibernate due to error. * * Used to query if dev_pm_ops.thaw() is called for normal hibernation case or * recovering from some error. * * Return: true for error case, false for normal case. */ bool pm_hibernate_is_recovering(void) { return pm_transition.event == PM_EVENT_RECOVER; } EXPORT_SYMBOL_GPL(pm_hibernate_is_recovering); static const char *pm_verb(int event) { switch (event) { case PM_EVENT_SUSPEND: return "suspend"; case PM_EVENT_RESUME: return "resume"; case PM_EVENT_FREEZE: return "freeze"; case PM_EVENT_QUIESCE: return "quiesce"; case PM_EVENT_HIBERNATE: return "hibernate"; case PM_EVENT_THAW: return "thaw"; case PM_EVENT_RESTORE: return "restore"; case PM_EVENT_RECOVER: return "recover"; default: return "(unknown PM event)"; } } /** * device_pm_sleep_init - Initialize system suspend-related device fields. * @dev: Device object being initialized. */ void device_pm_sleep_init(struct device *dev) { dev->power.is_prepared = false; dev->power.is_suspended = false; dev->power.is_noirq_suspended = false; dev->power.is_late_suspended = false; init_completion(&dev->power.completion); complete_all(&dev->power.completion); dev->power.wakeup = NULL; INIT_LIST_HEAD(&dev->power.entry); } /** * device_pm_lock - Lock the list of active devices used by the PM core. */ void device_pm_lock(void) { mutex_lock(&dpm_list_mtx); } /** * device_pm_unlock - Unlock the list of active devices used by the PM core. */ void device_pm_unlock(void) { mutex_unlock(&dpm_list_mtx); } /** * device_pm_add - Add a device to the PM core's list of active devices. * @dev: Device to add to the list. */ void device_pm_add(struct device *dev) { /* Skip PM setup/initialization. */ if (device_pm_not_required(dev)) return; pr_debug("Adding info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); device_pm_check_callbacks(dev); mutex_lock(&dpm_list_mtx); if (dev->parent && dev->parent->power.is_prepared) dev_warn(dev, "parent %s should not be sleeping\n", dev_name(dev->parent)); list_add_tail(&dev->power.entry, &dpm_list); dev->power.in_dpm_list = true; mutex_unlock(&dpm_list_mtx); } /** * device_pm_remove - Remove a device from the PM core's list of active devices. * @dev: Device to be removed from the list. */ void device_pm_remove(struct device *dev) { if (device_pm_not_required(dev)) return; pr_debug("Removing info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); complete_all(&dev->power.completion); mutex_lock(&dpm_list_mtx); list_del_init(&dev->power.entry); dev->power.in_dpm_list = false; mutex_unlock(&dpm_list_mtx); device_wakeup_disable(dev); pm_runtime_remove(dev); device_pm_check_callbacks(dev); } /** * device_pm_move_before - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come before. */ void device_pm_move_before(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s before %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert before devb. */ list_move_tail(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_after - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come after. */ void device_pm_move_after(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s after %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert after devb. */ list_move(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_last - Move device to end of the PM core's list of devices. * @dev: Device to move in dpm_list. */ void device_pm_move_last(struct device *dev) { pr_debug("Moving %s:%s to end of list\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); list_move_tail(&dev->power.entry, &dpm_list); } static ktime_t initcall_debug_start(struct device *dev, void *cb) { if (!pm_print_times_enabled) return 0; dev_info(dev, "calling %ps @ %i, parent: %s\n", cb, task_pid_nr(current), dev->parent ? dev_name(dev->parent) : "none"); return ktime_get(); } static void initcall_debug_report(struct device *dev, ktime_t calltime, void *cb, int error) { ktime_t rettime; if (!pm_print_times_enabled) return; rettime = ktime_get(); dev_info(dev, "%ps returned %d after %Ld usecs\n", cb, error, (unsigned long long)ktime_us_delta(rettime, calltime)); } /** * dpm_wait - Wait for a PM operation to complete. * @dev: Device to wait for. * @async: If unset, wait only if the device's power.async_suspend flag is set. */ static void dpm_wait(struct device *dev, bool async) { if (!dev) return; if (async || (pm_async_enabled && dev->power.async_suspend)) wait_for_completion(&dev->power.completion); } static int dpm_wait_fn(struct device *dev, void *async_ptr) { dpm_wait(dev, *((bool *)async_ptr)); return 0; } static void dpm_wait_for_children(struct device *dev, bool async) { device_for_each_child(dev, &async, dpm_wait_fn); } static void dpm_wait_for_suppliers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * If the supplier goes away right after we've checked the link to it, * we'll wait for its completion to change the state, but that's fine, * because the only things that will block as a result are the SRCU * callbacks freeing the link objects for the links in the list we're * walking. */ dev_for_each_link_to_supplier(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT && !device_link_flag_is_sync_state_only(link->flags)) dpm_wait(link->supplier, async); device_links_read_unlock(idx); } static bool dpm_wait_for_superior(struct device *dev, bool async) { struct device *parent; /* * If the device is resumed asynchronously and the parent's callback * deletes both the device and the parent itself, the parent object may * be freed while this function is running, so avoid that by reference * counting the parent once more unless the device has been deleted * already (in which case return right away). */ mutex_lock(&dpm_list_mtx); if (!device_pm_initialized(dev)) { mutex_unlock(&dpm_list_mtx); return false; } parent = get_device(dev->parent); mutex_unlock(&dpm_list_mtx); dpm_wait(parent, async); put_device(parent); dpm_wait_for_suppliers(dev, async); /* * If the parent's callback has deleted the device, attempting to resume * it would be invalid, so avoid doing that then. */ return device_pm_initialized(dev); } static void dpm_wait_for_consumers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * The status of a device link can only be changed from "dormant" by a * probe, but that cannot happen during system suspend/resume. In * theory it can change to "dormant" at that time, but then it is * reasonable to wait for the target device anyway (eg. if it goes * away, it's better to wait for it to go away completely and then * continue instead of trying to continue in parallel with its * unregistration). */ dev_for_each_link_to_consumer(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT && !device_link_flag_is_sync_state_only(link->flags)) dpm_wait(link->consumer, async); device_links_read_unlock(idx); } static void dpm_wait_for_subordinate(struct device *dev, bool async) { dpm_wait_for_children(dev, async); dpm_wait_for_consumers(dev, async); } /** * pm_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. */ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend; case PM_EVENT_RESUME: return ops->resume; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze; case PM_EVENT_HIBERNATE: return ops->poweroff; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw; case PM_EVENT_RESTORE: return ops->restore; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_late_early_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * Runtime PM is disabled for @dev while this function is being executed. */ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_late; case PM_EVENT_RESUME: return ops->resume_early; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_late; case PM_EVENT_HIBERNATE: return ops->poweroff_late; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_early; case PM_EVENT_RESTORE: return ops->restore_early; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_noirq_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_noirq; case PM_EVENT_RESUME: return ops->resume_noirq; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_noirq; case PM_EVENT_HIBERNATE: return ops->poweroff_noirq; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_noirq; case PM_EVENT_RESTORE: return ops->restore_noirq; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } static void pm_dev_dbg(struct device *dev, pm_message_t state, const char *info) { dev_dbg(dev, "%s%s%s driver flags: %x\n", info, pm_verb(state.event), ((state.event & PM_EVENT_SLEEP) && device_may_wakeup(dev)) ? ", may wakeup" : "", dev->power.driver_flags); } static void pm_dev_err(struct device *dev, pm_message_t state, const char *info, int error) { dev_err(dev, "failed to %s%s: error %d\n", pm_verb(state.event), info, error); } static void dpm_show_time(ktime_t starttime, pm_message_t state, int error, const char *info) { ktime_t calltime; u64 usecs64; int usecs; calltime = ktime_get(); usecs64 = ktime_to_ns(ktime_sub(calltime, starttime)); do_div(usecs64, NSEC_PER_USEC); usecs = usecs64; if (usecs == 0) usecs = 1; pm_pr_dbg("%s%s%s of devices %s after %ld.%03ld msecs\n", info ?: "", info ? " " : "", pm_verb(state.event), error ? "aborted" : "complete", usecs / USEC_PER_MSEC, usecs % USEC_PER_MSEC); } static int dpm_run_callback(pm_callback_t cb, struct device *dev, pm_message_t state, const char *info) { ktime_t calltime; int error; if (!cb) return 0; calltime = initcall_debug_start(dev, cb); pm_dev_dbg(dev, state, info); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } #ifdef CONFIG_DPM_WATCHDOG struct dpm_watchdog { struct device *dev; struct task_struct *tsk; struct timer_list timer; bool fatal; }; #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \ struct dpm_watchdog wd /** * dpm_watchdog_handler - Driver suspend / resume watchdog handler. * @t: The timer that PM watchdog depends on. * * Called when a driver has timed out suspending or resuming. * There's not much we can do here to recover so panic() to * capture a crash-dump in pstore. */ static void dpm_watchdog_handler(struct timer_list *t) { struct dpm_watchdog *wd = timer_container_of(wd, t, timer); struct timer_list *timer = &wd->timer; unsigned int time_left; if (wd->fatal) { dev_emerg(wd->dev, "**** DPM device timeout ****\n"); show_stack(wd->tsk, NULL, KERN_EMERG); panic("%s %s: unrecoverable failure\n", dev_driver_string(wd->dev), dev_name(wd->dev)); } time_left = CONFIG_DPM_WATCHDOG_TIMEOUT - CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; dev_warn(wd->dev, "**** DPM device timeout after %u seconds; %u seconds until panic ****\n", CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT, time_left); show_stack(wd->tsk, NULL, KERN_WARNING); wd->fatal = true; mod_timer(timer, jiffies + HZ * time_left); } /** * dpm_watchdog_set - Enable pm watchdog for given device. * @wd: Watchdog. Must be allocated on the stack. * @dev: Device to handle. */ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev) { struct timer_list *timer = &wd->timer; wd->dev = dev; wd->tsk = current; wd->fatal = CONFIG_DPM_WATCHDOG_TIMEOUT == CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; timer_setup_on_stack(timer, dpm_watchdog_handler, 0); /* use same timeout value for both suspend and resume */ timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; add_timer(timer); } /** * dpm_watchdog_clear - Disable suspend/resume watchdog. * @wd: Watchdog to disable. */ static void dpm_watchdog_clear(struct dpm_watchdog *wd) { struct timer_list *timer = &wd->timer; timer_delete_sync(timer); timer_destroy_on_stack(timer); } #else #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) #define dpm_watchdog_set(x, y) #define dpm_watchdog_clear(x) #endif /*------------------------- Resume routines -------------------------*/ /** * dev_pm_skip_resume - System-wide device resume optimization check. * @dev: Target device. * * Return: * - %false if the transition under way is RESTORE. * - Return value of dev_pm_skip_suspend() if the transition under way is THAW. * - The logical negation of %power.must_resume otherwise (that is, when the * transition under way is RESUME). */ bool dev_pm_skip_resume(struct device *dev) { if (pm_transition.event == PM_EVENT_RESTORE) return false; if (pm_transition.event == PM_EVENT_THAW) return dev_pm_skip_suspend(dev); return !dev->power.must_resume; } static bool is_async(struct device *dev) { return dev->power.async_suspend && pm_async_enabled && !pm_trace_is_enabled(); } static bool __dpm_async(struct device *dev, async_func_t func) { if (dev->power.work_in_progress) return true; if (!is_async(dev)) return false; dev->power.work_in_progress = true; get_device(dev); if (async_schedule_dev_nocall(func, dev)) return true; put_device(dev); return false; } static bool dpm_async_fn(struct device *dev, async_func_t func) { guard(mutex)(&async_wip_mtx); return __dpm_async(dev, func); } static int dpm_async_with_cleanup(struct device *dev, void *fn) { guard(mutex)(&async_wip_mtx); if (!__dpm_async(dev, fn)) dev->power.work_in_progress = false; return 0; } static void dpm_async_resume_children(struct device *dev, async_func_t func) { /* * Prevent racing with dpm_clear_async_state() during initial list * walks in dpm_noirq_resume_devices(), dpm_resume_early(), and * dpm_resume(). */ guard(mutex)(&dpm_list_mtx); /* * Start processing "async" children of the device unless it's been * started already for them. */ device_for_each_child(dev, func, dpm_async_with_cleanup); } static void dpm_async_resume_subordinate(struct device *dev, async_func_t func) { struct device_link *link; int idx; dpm_async_resume_children(dev, func); idx = device_links_read_lock(); /* Start processing the device's "async" consumers. */ dev_for_each_link_to_consumer(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_async_with_cleanup(link->consumer, func); device_links_read_unlock(idx); } static void dpm_clear_async_state(struct device *dev) { reinit_completion(&dev->power.completion); dev->power.work_in_progress = false; } static bool dpm_root_device(struct device *dev) { lockdep_assert_held(&dpm_list_mtx); /* * Since this function is required to run under dpm_list_mtx, the * list_empty() below will only return true if the device's list of * consumers is actually empty before calling it. */ return !dev->parent && list_empty(&dev->links.suppliers); } static void async_resume_noirq(void *data, async_cookie_t cookie); /** * device_resume_noirq - Execute a "noirq resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static void device_resume_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; bool skip_resume; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_noirq_suspended) { /* * This means that system suspend has been aborted in the noirq * phase before invoking the noirq suspend callback for the * device, so if device_suspend_late() has left it in suspend, * device_resume_early() should leave it in suspend either in * case the early resume of it depends on the noirq resume that * has not run. */ if (dev_pm_skip_suspend(dev)) dev->power.must_resume = false; goto Out; } if (!dpm_wait_for_superior(dev, async)) goto Out; skip_resume = dev_pm_skip_resume(dev); /* * If the driver callback is skipped below or by the middle layer * callback and device_resume_early() also skips the driver callback for * this device later, it needs to appear as "suspended" to PM-runtime, * so change its status accordingly. * * Otherwise, the device is going to be resumed, so set its PM-runtime * status to "active" unless its power.smart_suspend flag is clear, in * which case it is not necessary to update its PM-runtime status. */ if (skip_resume) pm_runtime_set_suspended(dev); else if (dev_pm_smart_suspend(dev)) pm_runtime_set_active(dev); if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (skip_resume) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_noirq_suspended = false; Out: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); } dpm_async_resume_subordinate(dev, async_resume_noirq); } static void async_resume_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_noirq(dev, pm_transition, true); put_device(dev); } static void dpm_noirq_resume_devices(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Start processing "async" root devices upfront so they don't wait for * the "sync" devices they don't depend on. */ list_for_each_entry(dev, &dpm_noirq_list, power.entry) { dpm_clear_async_state(dev); if (dpm_root_device(dev)) dpm_async_with_cleanup(dev, async_resume_noirq); } while (!list_empty(&dpm_noirq_list)) { dev = to_device(dpm_noirq_list.next); list_move_tail(&dev->power.entry, &dpm_late_early_list); if (!dpm_async_fn(dev, async_resume_noirq)) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "noirq"); if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); } /** * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. * @state: PM transition of the system being carried out. * * Invoke the "noirq" resume callbacks for all devices in dpm_noirq_list and * allow device drivers' interrupt handlers to be called. */ void dpm_resume_noirq(pm_message_t state) { dpm_noirq_resume_devices(state); resume_device_irqs(); device_wakeup_disarm_wake_irqs(); } static void async_resume_early(void *data, async_cookie_t cookie); /** * device_resume_early - Execute an "early resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ static void device_resume_early(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_late_suspended) goto Out; if (!dpm_wait_for_superior(dev, async)) goto Out; if (dev->pm_domain) { info = "early power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "early type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "early class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "early bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_resume(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "early driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_late_suspended = false; Out: TRACE_RESUME(error); pm_runtime_enable(dev); complete_all(&dev->power.completion); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async early" : " early", error); } dpm_async_resume_subordinate(dev, async_resume_early); } static void async_resume_early(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_early(dev, pm_transition, true); put_device(dev); } /** * dpm_resume_early - Execute "early resume" callbacks for all devices. * @state: PM transition of the system being carried out. */ void dpm_resume_early(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_early"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Start processing "async" root devices upfront so they don't wait for * the "sync" devices they don't depend on. */ list_for_each_entry(dev, &dpm_late_early_list, power.entry) { dpm_clear_async_state(dev); if (dpm_root_device(dev)) dpm_async_with_cleanup(dev, async_resume_early); } while (!list_empty(&dpm_late_early_list)) { dev = to_device(dpm_late_early_list.next); list_move_tail(&dev->power.entry, &dpm_suspended_list); if (!dpm_async_fn(dev, async_resume_early)) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_early(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "early"); if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME_EARLY); trace_suspend_resume(TPS("dpm_resume_early"), state.event, false); } /** * dpm_resume_start - Execute "noirq" and "early" device callbacks. * @state: PM transition of the system being carried out. */ void dpm_resume_start(pm_message_t state) { dpm_resume_noirq(state); dpm_resume_early(state); } EXPORT_SYMBOL_GPL(dpm_resume_start); static void async_resume(void *data, async_cookie_t cookie); /** * device_resume - Execute "resume" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. */ static void device_resume(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore) goto Complete; if (!dev->power.is_suspended) goto Complete; dev->power.is_suspended = false; if (dev->power.direct_complete) { /* * Allow new children to be added under the device after this * point if it has no PM callbacks. */ if (dev->power.no_pm_callbacks) dev->power.is_prepared = false; /* Match the pm_runtime_disable() in device_suspend(). */ pm_runtime_enable(dev); goto Complete; } if (!dpm_wait_for_superior(dev, async)) goto Complete; dpm_watchdog_set(&wd, dev); device_lock(dev); /* * This is a fib. But we'll allow new children to be added below * a resumed device, even if the device hasn't been completed yet. */ dev->power.is_prepared = false; if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Driver; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Driver; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Driver; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->resume) { info = "legacy bus "; callback = dev->bus->resume; goto End; } } Driver: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } End: error = dpm_run_callback(callback, dev, state, info); device_unlock(dev); dpm_watchdog_clear(&wd); Complete: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } dpm_async_resume_subordinate(dev, async_resume); } static void async_resume(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume(dev, pm_transition, true); put_device(dev); } /** * dpm_resume - Execute "resume" callbacks for non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the appropriate "resume" callback for all devices whose status * indicates that they are suspended. */ void dpm_resume(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume"), state.event, true); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Start processing "async" root devices upfront so they don't wait for * the "sync" devices they don't depend on. */ list_for_each_entry(dev, &dpm_suspended_list, power.entry) { dpm_clear_async_state(dev); if (dpm_root_device(dev)) dpm_async_with_cleanup(dev, async_resume); } while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.next); list_move_tail(&dev->power.entry, &dpm_prepared_list); if (!dpm_async_fn(dev, async_resume)) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, NULL); if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME); cpufreq_resume(); devfreq_resume(); trace_suspend_resume(TPS("dpm_resume"), state.event, false); } /** * device_complete - Complete a PM transition for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. */ static void device_complete(struct device *dev, pm_message_t state) { void (*callback)(struct device *) = NULL; const char *info = NULL; if (dev->power.syscore) goto out; device_lock(dev); if (dev->pm_domain) { info = "completing power domain "; callback = dev->pm_domain->ops.complete; } else if (dev->type && dev->type->pm) { info = "completing type "; callback = dev->type->pm->complete; } else if (dev->class && dev->class->pm) { info = "completing class "; callback = dev->class->pm->complete; } else if (dev->bus && dev->bus->pm) { info = "completing bus "; callback = dev->bus->pm->complete; } if (!callback && dev->driver && dev->driver->pm) { info = "completing driver "; callback = dev->driver->pm->complete; } if (callback) { pm_dev_dbg(dev, state, info); callback(dev); } device_unlock(dev); out: /* If enabling runtime PM for the device is blocked, unblock it. */ pm_runtime_unblock(dev); pm_runtime_put(dev); } /** * dpm_complete - Complete a PM transition for all non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the ->complete() callbacks for all devices whose PM status is not * DPM_ON (this allows new devices to be registered). */ void dpm_complete(pm_message_t state) { struct list_head list; trace_suspend_resume(TPS("dpm_complete"), state.event, true); INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); get_device(dev); dev->power.is_prepared = false; list_move(&dev->power.entry, &list); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); device_complete(dev, state); trace_device_pm_callback_end(dev, 0); put_device(dev); mutex_lock(&dpm_list_mtx); } list_splice(&list, &dpm_list); mutex_unlock(&dpm_list_mtx); /* Allow device probing and trigger re-probing of deferred devices */ device_unblock_probing(); trace_suspend_resume(TPS("dpm_complete"), state.event, false); } /** * dpm_resume_end - Execute "resume" callbacks and complete system transition. * @state: PM transition of the system being carried out. * * Execute "resume" callbacks for all devices and complete the PM transition of * the system. */ void dpm_resume_end(pm_message_t state) { dpm_resume(state); pm_restore_gfp_mask(); dpm_complete(state); } EXPORT_SYMBOL_GPL(dpm_resume_end); /*------------------------- Suspend routines -------------------------*/ static bool dpm_leaf_device(struct device *dev) { struct device *child; lockdep_assert_held(&dpm_list_mtx); child = device_find_any_child(dev); if (child) { put_device(child); return false; } /* * Since this function is required to run under dpm_list_mtx, the * list_empty() below will only return true if the device's list of * consumers is actually empty before calling it. */ return list_empty(&dev->links.consumers); } static bool dpm_async_suspend_parent(struct device *dev, async_func_t func) { guard(mutex)(&dpm_list_mtx); /* * If the device is suspended asynchronously and the parent's callback * deletes both the device and the parent itself, the parent object may * be freed while this function is running, so avoid that by checking * if the device has been deleted already as the parent cannot be * deleted before it. */ if (!device_pm_initialized(dev)) return false; /* Start processing the device's parent if it is "async". */ if (dev->parent) dpm_async_with_cleanup(dev->parent, func); return true; } static void dpm_async_suspend_superior(struct device *dev, async_func_t func) { struct device_link *link; int idx; if (!dpm_async_suspend_parent(dev, func)) return; idx = device_links_read_lock(); /* Start processing the device's "async" suppliers. */ dev_for_each_link_to_supplier(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_async_with_cleanup(link->supplier, func); device_links_read_unlock(idx); } static void dpm_async_suspend_complete_all(struct list_head *device_list) { struct device *dev; guard(mutex)(&async_wip_mtx); list_for_each_entry_reverse(dev, device_list, power.entry) { /* * In case the device is being waited for and async processing * has not started for it yet, let the waiters make progress. */ if (!dev->power.work_in_progress) complete_all(&dev->power.completion); } } /** * resume_event - Return a "resume" message for given "suspend" sleep state. * @sleep_state: PM message representing a sleep state. * * Return a PM message representing the resume event corresponding to given * sleep state. */ static pm_message_t resume_event(pm_message_t sleep_state) { switch (sleep_state.event) { case PM_EVENT_SUSPEND: return PMSG_RESUME; case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return PMSG_RECOVER; case PM_EVENT_HIBERNATE: return PMSG_RESTORE; } return PMSG_ON; } static void dpm_superior_set_must_resume(struct device *dev) { struct device_link *link; int idx; if (dev->parent) dev->parent->power.must_resume = true; idx = device_links_read_lock(); dev_for_each_link_to_supplier(link, dev) link->supplier->power.must_resume = true; device_links_read_unlock(idx); } static void async_suspend_noirq(void *data, async_cookie_t cookie); /** * device_suspend_noirq - Execute a "noirq suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static void device_suspend_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (READ_ONCE(async_error)) goto Complete; if (dev->power.syscore || dev->power.direct_complete) goto Complete; if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); goto Complete; } Skip: dev->power.is_noirq_suspended = true; /* * Devices must be resumed unless they are explicitly allowed to be left * in suspend, but even in that case skipping the resume of devices that * were in use right before the system suspend (as indicated by their * runtime PM usage counters and child counters) would be suboptimal. */ if (!(dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME) && dev->power.may_skip_resume) || !pm_runtime_need_not_resume(dev)) dev->power.must_resume = true; if (dev->power.must_resume) dpm_superior_set_must_resume(dev); Complete: complete_all(&dev->power.completion); TRACE_SUSPEND(error); if (error || READ_ONCE(async_error)) return; dpm_async_suspend_superior(dev, async_suspend_noirq); } static void async_suspend_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_noirq(dev, pm_transition, true); put_device(dev); } static int dpm_noirq_suspend_devices(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; int error; trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Start processing "async" leaf devices upfront so they don't need to * wait for the "sync" devices they don't depend on. */ list_for_each_entry_reverse(dev, &dpm_late_early_list, power.entry) { dpm_clear_async_state(dev); if (dpm_leaf_device(dev)) dpm_async_with_cleanup(dev, async_suspend_noirq); } while (!list_empty(&dpm_late_early_list)) { dev = to_device(dpm_late_early_list.prev); list_move(&dev->power.entry, &dpm_noirq_list); if (dpm_async_fn(dev, async_suspend_noirq)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); device_suspend_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_late_early_list); /* * Move all devices to the target list to resume them * properly. */ list_splice_init(&dpm_late_early_list, &dpm_noirq_list); break; } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); error = READ_ONCE(async_error); if (error) dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); dpm_show_time(starttime, state, error, "noirq"); trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, false); return error; } /** * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices. * @state: PM transition of the system being carried out. * * Prevent device drivers' interrupt handlers from being called and invoke * "noirq" suspend callbacks for all non-sysdev devices. */ int dpm_suspend_noirq(pm_message_t state) { int ret; device_wakeup_arm_wake_irqs(); suspend_device_irqs(); ret = dpm_noirq_suspend_devices(state); if (ret) dpm_resume_noirq(resume_event(state)); return ret; } static void dpm_propagate_wakeup_to_parent(struct device *dev) { struct device *parent = dev->parent; if (!parent) return; spin_lock_irq(&parent->power.lock); if (device_wakeup_path(dev) && !parent->power.ignore_children) parent->power.wakeup_path = true; spin_unlock_irq(&parent->power.lock); } static void async_suspend_late(void *data, async_cookie_t cookie); /** * device_suspend_late - Execute a "late suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ static void device_suspend_late(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); /* * Disable runtime PM for the device without checking if there is a * pending resume request for it. */ __pm_runtime_disable(dev, false); dpm_wait_for_subordinate(dev, async); if (READ_ONCE(async_error)) goto Complete; if (pm_wakeup_pending()) { WRITE_ONCE(async_error, -EBUSY); goto Complete; } if (dev->power.syscore || dev->power.direct_complete) goto Complete; if (dev->pm_domain) { info = "late power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "late type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "late class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "late bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "late driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async late" : " late", error); goto Complete; } dpm_propagate_wakeup_to_parent(dev); Skip: dev->power.is_late_suspended = true; Complete: TRACE_SUSPEND(error); complete_all(&dev->power.completion); if (error || READ_ONCE(async_error)) return; dpm_async_suspend_superior(dev, async_suspend_late); } static void async_suspend_late(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_late(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend_late - Execute "late suspend" callbacks for all devices. * @state: PM transition of the system being carried out. */ int dpm_suspend_late(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; int error; trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true); pm_transition = state; async_error = 0; wake_up_all_idle_cpus(); mutex_lock(&dpm_list_mtx); /* * Start processing "async" leaf devices upfront so they don't need to * wait for the "sync" devices they don't depend on. */ list_for_each_entry_reverse(dev, &dpm_suspended_list, power.entry) { dpm_clear_async_state(dev); if (dpm_leaf_device(dev)) dpm_async_with_cleanup(dev, async_suspend_late); } while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.prev); list_move(&dev->power.entry, &dpm_late_early_list); if (dpm_async_fn(dev, async_suspend_late)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); device_suspend_late(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_suspended_list); /* * Move all devices to the target list to resume them * properly. */ list_splice_init(&dpm_suspended_list, &dpm_late_early_list); break; } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); error = READ_ONCE(async_error); if (error) { dpm_save_failed_step(SUSPEND_SUSPEND_LATE); dpm_resume_early(resume_event(state)); } dpm_show_time(starttime, state, error, "late"); trace_suspend_resume(TPS("dpm_suspend_late"), state.event, false); return error; } /** * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks. * @state: PM transition of the system being carried out. */ int dpm_suspend_end(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_suspend_late(state); if (error) goto out; error = dpm_suspend_noirq(state); if (error) dpm_resume_early(resume_event(state)); out: dpm_show_time(starttime, state, error, "end"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_end); /** * legacy_suspend - Execute a legacy (bus or class) suspend callback for device. * @dev: Device to suspend. * @state: PM transition of the system being carried out. * @cb: Suspend callback to execute. * @info: string description of caller. */ static int legacy_suspend(struct device *dev, pm_message_t state, int (*cb)(struct device *dev, pm_message_t state), const char *info) { int error; ktime_t calltime; calltime = initcall_debug_start(dev, cb); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev, state); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } static void dpm_clear_superiors_direct_complete(struct device *dev) { struct device_link *link; int idx; if (dev->parent) { spin_lock_irq(&dev->parent->power.lock); dev->parent->power.direct_complete = false; spin_unlock_irq(&dev->parent->power.lock); } idx = device_links_read_lock(); dev_for_each_link_to_supplier(link, dev) { spin_lock_irq(&link->supplier->power.lock); link->supplier->power.direct_complete = false; spin_unlock_irq(&link->supplier->power.lock); } device_links_read_unlock(idx); } static void async_suspend(void *data, async_cookie_t cookie); /** * device_suspend - Execute "suspend" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. */ static void device_suspend(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (READ_ONCE(async_error)) { dev->power.direct_complete = false; goto Complete; } /* * Wait for possible runtime PM transitions of the device in progress * to complete and if there's a runtime resume request pending for it, * resume it before proceeding with invoking the system-wide suspend * callbacks for it. * * If the system-wide suspend callbacks below change the configuration * of the device, they must disable runtime PM for it or otherwise * ensure that its runtime-resume callbacks will not be confused by that * change in case they are invoked going forward. */ pm_runtime_barrier(dev); if (pm_wakeup_pending()) { dev->power.direct_complete = false; WRITE_ONCE(async_error, -EBUSY); goto Complete; } if (dev->power.syscore) goto Complete; /* Avoid direct_complete to let wakeup_path propagate. */ if (device_may_wakeup(dev) || device_wakeup_path(dev)) dev->power.direct_complete = false; if (dev->power.direct_complete) { if (pm_runtime_status_suspended(dev)) { pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev)) { pm_dev_dbg(dev, state, "direct-complete "); dev->power.is_suspended = true; goto Complete; } pm_runtime_enable(dev); } dev->power.direct_complete = false; } dev->power.may_skip_resume = true; dev->power.must_resume = !dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME); dpm_watchdog_set(&wd, dev); device_lock(dev); if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Run; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Run; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Run; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->suspend) { pm_dev_dbg(dev, state, "legacy bus "); error = legacy_suspend(dev, state, dev->bus->suspend, "legacy bus "); goto End; } } Run: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } error = dpm_run_callback(callback, dev, state, info); End: if (!error) { dev->power.is_suspended = true; if (device_may_wakeup(dev)) dev->power.wakeup_path = true; dpm_propagate_wakeup_to_parent(dev); dpm_clear_superiors_direct_complete(dev); } device_unlock(dev); dpm_watchdog_clear(&wd); Complete: if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } complete_all(&dev->power.completion); TRACE_SUSPEND(error); if (error || READ_ONCE(async_error)) return; dpm_async_suspend_superior(dev, async_suspend); } static void async_suspend(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend - Execute "suspend" callbacks for all non-sysdev devices. * @state: PM transition of the system being carried out. */ int dpm_suspend(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; int error; trace_suspend_resume(TPS("dpm_suspend"), state.event, true); might_sleep(); devfreq_suspend(); cpufreq_suspend(); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Start processing "async" leaf devices upfront so they don't need to * wait for the "sync" devices they don't depend on. */ list_for_each_entry_reverse(dev, &dpm_prepared_list, power.entry) { dpm_clear_async_state(dev); if (dpm_leaf_device(dev)) dpm_async_with_cleanup(dev, async_suspend); } while (!list_empty(&dpm_prepared_list)) { dev = to_device(dpm_prepared_list.prev); list_move(&dev->power.entry, &dpm_suspended_list); if (dpm_async_fn(dev, async_suspend)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); device_suspend(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_prepared_list); /* * Move all devices to the target list to resume them * properly. */ list_splice_init(&dpm_prepared_list, &dpm_suspended_list); break; } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); error = READ_ONCE(async_error); if (error) dpm_save_failed_step(SUSPEND_SUSPEND); dpm_show_time(starttime, state, error, NULL); trace_suspend_resume(TPS("dpm_suspend"), state.event, false); return error; } static bool device_prepare_smart_suspend(struct device *dev) { struct device_link *link; bool ret = true; int idx; /* * The "smart suspend" feature is enabled for devices whose drivers ask * for it and for devices without PM callbacks. * * However, if "smart suspend" is not enabled for the device's parent * or any of its suppliers that take runtime PM into account, it cannot * be enabled for the device either. */ if (!dev->power.no_pm_callbacks && !dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) return false; if (dev->parent && !dev_pm_smart_suspend(dev->parent) && !dev->parent->power.ignore_children && !pm_runtime_blocked(dev->parent)) return false; idx = device_links_read_lock(); dev_for_each_link_to_supplier(link, dev) { if (!device_link_test(link, DL_FLAG_PM_RUNTIME)) continue; if (!dev_pm_smart_suspend(link->supplier) && !pm_runtime_blocked(link->supplier)) { ret = false; break; } } device_links_read_unlock(idx); return ret; } /** * device_prepare - Prepare a device for system power transition. * @dev: Device to handle. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for given device. No new children of the * device may be registered after this function has returned. */ static int device_prepare(struct device *dev, pm_message_t state) { int (*callback)(struct device *) = NULL; bool smart_suspend; int ret = 0; /* * If a device's parent goes into runtime suspend at the wrong time, * it won't be possible to resume the device. To prevent this we * block runtime suspend here, during the prepare phase, and allow * it again during the complete phase. */ pm_runtime_get_noresume(dev); /* * If runtime PM is disabled for the device at this point and it has * never been enabled so far, it should not be enabled until this system * suspend-resume cycle is complete, so prepare to trigger a warning on * subsequent attempts to enable it. */ smart_suspend = !pm_runtime_block_if_disabled(dev); if (dev->power.syscore) return 0; device_lock(dev); dev->power.wakeup_path = false; if (dev->power.no_pm_callbacks) goto unlock; if (dev->pm_domain) callback = dev->pm_domain->ops.prepare; else if (dev->type && dev->type->pm) callback = dev->type->pm->prepare; else if (dev->class && dev->class->pm) callback = dev->class->pm->prepare; else if (dev->bus && dev->bus->pm) callback = dev->bus->pm->prepare; if (!callback && dev->driver && dev->driver->pm) callback = dev->driver->pm->prepare; if (callback) ret = callback(dev); unlock: device_unlock(dev); if (ret < 0) { suspend_report_result(dev, callback, ret); pm_runtime_put(dev); return ret; } /* Do not enable "smart suspend" for devices with disabled runtime PM. */ if (smart_suspend) smart_suspend = device_prepare_smart_suspend(dev); spin_lock_irq(&dev->power.lock); dev->power.smart_suspend = smart_suspend; /* * A positive return value from ->prepare() means "this device appears * to be runtime-suspended and its state is fine, so if it really is * runtime-suspended, you can leave it in that state provided that you * will do the same thing with all of its descendants". This only * applies to suspend transitions, however. */ dev->power.direct_complete = state.event == PM_EVENT_SUSPEND && (ret > 0 || dev->power.no_pm_callbacks) && !dev_pm_test_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE); spin_unlock_irq(&dev->power.lock); return 0; } /** * dpm_prepare - Prepare all non-sysdev devices for a system PM transition. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for all devices. */ int dpm_prepare(pm_message_t state) { int error = 0; trace_suspend_resume(TPS("dpm_prepare"), state.event, true); /* * Give a chance for the known devices to complete their probes, before * disable probing of devices. This sync point is important at least * at boot time + hibernation restore. */ wait_for_device_probe(); /* * It is unsafe if probing of devices will happen during suspend or * hibernation and system behavior will be unpredictable in this case. * So, let's prohibit device's probing here and defer their probes * instead. The normal behavior will be restored in dpm_complete(). */ device_block_probing(); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_list) && !error) { struct device *dev = to_device(dpm_list.next); get_device(dev); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); error = device_prepare(dev, state); trace_device_pm_callback_end(dev, error); mutex_lock(&dpm_list_mtx); if (!error) { dev->power.is_prepared = true; if (!list_empty(&dev->power.entry)) list_move_tail(&dev->power.entry, &dpm_prepared_list); } else if (error == -EAGAIN) { error = 0; } else { dev_info(dev, "not prepared for power transition: code %d\n", error); } mutex_unlock(&dpm_list_mtx); put_device(dev); mutex_lock(&dpm_list_mtx); } mutex_unlock(&dpm_list_mtx); trace_suspend_resume(TPS("dpm_prepare"), state.event, false); return error; } /** * dpm_suspend_start - Prepare devices for PM transition and suspend them. * @state: PM transition of the system being carried out. * * Prepare all non-sysdev devices for system PM transition and execute "suspend" * callbacks for them. */ int dpm_suspend_start(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_prepare(state); if (error) dpm_save_failed_step(SUSPEND_PREPARE); else { pm_restrict_gfp_mask(); error = dpm_suspend(state); } dpm_show_time(starttime, state, error, "start"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_start); void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret) { if (ret) dev_err(dev, "%s(): %ps returns %d\n", function, fn, ret); } EXPORT_SYMBOL_GPL(__suspend_report_result); /** * device_pm_wait_for_dev - Wait for suspend/resume of a device to complete. * @subordinate: Device that needs to wait for @dev. * @dev: Device to wait for. */ int device_pm_wait_for_dev(struct device *subordinate, struct device *dev) { dpm_wait(dev, subordinate->power.async_suspend); return async_error; } EXPORT_SYMBOL_GPL(device_pm_wait_for_dev); /** * dpm_for_each_dev - device iterator. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over devices in dpm_list, and call @fn for each device, * passing it @data. */ void dpm_for_each_dev(void *data, void (*fn)(struct device *, void *)) { struct device *dev; if (!fn) return; device_pm_lock(); list_for_each_entry(dev, &dpm_list, power.entry) fn(dev, data); device_pm_unlock(); } EXPORT_SYMBOL_GPL(dpm_for_each_dev); static bool pm_ops_is_empty(const struct dev_pm_ops *ops) { if (!ops) return true; return !ops->prepare && !ops->suspend && !ops->suspend_late && !ops->suspend_noirq && !ops->resume_noirq && !ops->resume_early && !ops->resume && !ops->complete; } void device_pm_check_callbacks(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); dev->power.no_pm_callbacks = (!dev->bus || (pm_ops_is_empty(dev->bus->pm) && !dev->bus->suspend && !dev->bus->resume)) && (!dev->class || pm_ops_is_empty(dev->class->pm)) && (!dev->type || pm_ops_is_empty(dev->type->pm)) && (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && (!dev->driver || (pm_ops_is_empty(dev->driver->pm) && !dev->driver->suspend && !dev->driver->resume)); spin_unlock_irqrestore(&dev->power.lock, flags); } bool dev_pm_skip_suspend(struct device *dev) { return dev_pm_smart_suspend(dev) && pm_runtime_status_suspended(dev); }
3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. */ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/xattr.h> #include <linux/gfs2_ondisk.h> #include <linux/posix_acl_xattr.h> #include <linux/uaccess.h> #include "gfs2.h" #include "incore.h" #include "acl.h" #include "xattr.h" #include "glock.h" #include "inode.h" #include "meta_io.h" #include "quota.h" #include "rgrp.h" #include "super.h" #include "trans.h" #include "util.h" /* * ea_calc_size - returns the actual number of bytes the request will take up * (not counting any unstuffed data blocks) * * Returns: 1 if the EA should be stuffed */ static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize, unsigned int *size) { unsigned int jbsize = sdp->sd_jbsize; /* Stuffed */ *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8); if (*size <= jbsize) return 1; /* Unstuffed */ *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8); return 0; } static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize) { unsigned int size; if (dsize > GFS2_EA_MAX_DATA_LEN) return -ERANGE; ea_calc_size(sdp, nsize, dsize, &size); /* This can only happen with 512 byte blocks */ if (size > sdp->sd_jbsize) return -ERANGE; return 0; } static bool gfs2_eatype_valid(struct gfs2_sbd *sdp, u8 type) { switch(sdp->sd_sb.sb_fs_format) { case GFS2_FS_FORMAT_MAX: return true; case GFS2_FS_FORMAT_MIN: return type <= GFS2_EATYPE_SECURITY; default: return false; } } typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, void *private); static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh, ea_call_t ea_call, void *data) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_ea_header *ea, *prev = NULL; int error = 0; if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA)) return -EIO; for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) { if (!GFS2_EA_REC_LEN(ea)) { gfs2_consist_inode(ip); return -EIO; } if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <= bh->b_data + bh->b_size)) { gfs2_consist_inode(ip); return -EIO; } if (!gfs2_eatype_valid(sdp, ea->ea_type)) { gfs2_consist_inode(ip); return -EIO; } error = ea_call(ip, bh, ea, prev, data); if (error) return error; if (GFS2_EA_IS_LAST(ea)) { if ((char *)GFS2_EA2NEXT(ea) != bh->b_data + bh->b_size) { gfs2_consist_inode(ip); return -EIO; } break; } } return error; } static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) { struct buffer_head *bh, *eabh; __be64 *eablk, *end; int error; error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &bh); if (error) return error; if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) { error = ea_foreach_i(ip, bh, ea_call, data); goto out; } if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) { error = -EIO; goto out; } eablk = (__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)); end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs; for (; eablk < end; eablk++) { u64 bn; if (!*eablk) break; bn = be64_to_cpu(*eablk); error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, 0, &eabh); if (error) break; error = ea_foreach_i(ip, eabh, ea_call, data); brelse(eabh); if (error) break; } out: brelse(bh); return error; } struct ea_find { int type; const char *name; size_t namel; struct gfs2_ea_location *ef_el; }; static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, void *private) { struct ea_find *ef = private; if (ea->ea_type == GFS2_EATYPE_UNUSED) return 0; if (ea->ea_type == ef->type) { if (ea->ea_name_len == ef->namel && !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) { struct gfs2_ea_location *el = ef->ef_el; get_bh(bh); el->el_bh = bh; el->el_ea = ea; el->el_prev = prev; return 1; } } return 0; } static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, struct gfs2_ea_location *el) { struct ea_find ef; int error; ef.type = type; ef.name = name; ef.namel = strlen(name); ef.ef_el = el; memset(el, 0, sizeof(struct gfs2_ea_location)); error = ea_foreach(ip, ea_find_i, &ef); if (error > 0) return 0; return error; } /* * ea_dealloc_unstuffed * * Take advantage of the fact that all unstuffed blocks are * allocated from the same RG. But watch, this may not always * be true. * * Returns: errno */ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, void *private) { int *leave = private; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; struct gfs2_holder rg_gh; __be64 *dataptrs; u64 bn = 0; u64 bstart = 0; unsigned int blen = 0; unsigned int blks = 0; unsigned int x; int error; error = gfs2_rindex_update(sdp); if (error) return error; if (GFS2_EA_IS_STUFFED(ea)) return 0; dataptrs = GFS2_EA2DATAPTRS(ea); for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { if (*dataptrs) { blks++; bn = be64_to_cpu(*dataptrs); } } if (!blks) return 0; rgd = gfs2_blk2rgrpd(sdp, bn, 1); if (!rgd) { gfs2_consist_inode(ip); return -EIO; } error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE, &rg_gh); if (error) return error; error = gfs2_trans_begin(sdp, rgd->rd_length + RES_DINODE + RES_EATTR + RES_STATFS + RES_QUOTA, blks); if (error) goto out_gunlock; gfs2_trans_add_meta(ip->i_gl, bh); dataptrs = GFS2_EA2DATAPTRS(ea); for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { if (!*dataptrs) break; bn = be64_to_cpu(*dataptrs); if (bstart + blen == bn) blen++; else { if (bstart) gfs2_free_meta(ip, rgd, bstart, blen); bstart = bn; blen = 1; } *dataptrs = 0; gfs2_add_inode_blocks(&ip->i_inode, -1); } if (bstart) gfs2_free_meta(ip, rgd, bstart, blen); if (prev && !leave) { u32 len; len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); prev->ea_rec_len = cpu_to_be32(len); if (GFS2_EA_IS_LAST(ea)) prev->ea_flags |= GFS2_EAFLAG_LAST; } else { ea->ea_type = GFS2_EATYPE_UNUSED; ea->ea_num_ptrs = 0; } inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(sdp); out_gunlock: gfs2_glock_dq_uninit(&rg_gh); return error; } static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, int leave) { int error; error = gfs2_rindex_update(GFS2_SB(&ip->i_inode)); if (error) return error; error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) goto out_alloc; error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL); gfs2_quota_unhold(ip); out_alloc: return error; } struct ea_list { struct gfs2_ea_request *ei_er; unsigned int ei_size; }; static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, void *private) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct ea_list *ei = private; struct gfs2_ea_request *er = ei->ei_er; unsigned int ea_size; char *prefix; unsigned int l; if (ea->ea_type == GFS2_EATYPE_UNUSED) return 0; BUG_ON(ea->ea_type > GFS2_EATYPE_SECURITY && sdp->sd_sb.sb_fs_format == GFS2_FS_FORMAT_MIN); switch (ea->ea_type) { case GFS2_EATYPE_USR: prefix = "user."; l = 5; break; case GFS2_EATYPE_SYS: prefix = "system."; l = 7; break; case GFS2_EATYPE_SECURITY: prefix = "security."; l = 9; break; case GFS2_EATYPE_TRUSTED: prefix = "trusted."; l = 8; break; default: return 0; } ea_size = l + ea->ea_name_len + 1; if (er->er_data_len) { if (ei->ei_size + ea_size > er->er_data_len) return -ERANGE; memcpy(er->er_data + ei->ei_size, prefix, l); memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea), ea->ea_name_len); er->er_data[ei->ei_size + ea_size - 1] = 0; } ei->ei_size += ea_size; return 0; } /** * gfs2_listxattr - List gfs2 extended attributes * @dentry: The dentry whose inode we are interested in * @buffer: The buffer to write the results * @size: The size of the buffer * * Returns: actual size of data on success, -errno on error */ ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); struct gfs2_ea_request er; struct gfs2_holder i_gh; int error; memset(&er, 0, sizeof(struct gfs2_ea_request)); if (size) { er.er_data = buffer; er.er_data_len = size; } error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; if (ip->i_eattr) { struct ea_list ei = { .ei_er = &er, .ei_size = 0 }; error = ea_foreach(ip, ea_list_i, &ei); if (!error) error = ei.ei_size; } gfs2_glock_dq_uninit(&i_gh); return error; } /** * gfs2_iter_unstuffed - copies the unstuffed xattr data to/from the * request buffer * @ip: The GFS2 inode * @ea: The extended attribute header structure * @din: The data to be copied in * @dout: The data to be copied out (one of din,dout will be NULL) * * Returns: errno */ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, const char *din, char *dout) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head **bh; unsigned int amount = GFS2_EA_DATA_LEN(ea); unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); __be64 *dataptrs = GFS2_EA2DATAPTRS(ea); unsigned int x; int error = 0; unsigned char *pos; unsigned cp_size; bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS); if (!bh) return -ENOMEM; for (x = 0; x < nptrs; x++) { error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, 0, bh + x); if (error) { while (x--) brelse(bh[x]); goto out; } dataptrs++; } for (x = 0; x < nptrs; x++) { error = gfs2_meta_wait(sdp, bh[x]); if (error) { for (; x < nptrs; x++) brelse(bh[x]); goto out; } if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) { for (; x < nptrs; x++) brelse(bh[x]); error = -EIO; goto out; } pos = bh[x]->b_data + sizeof(struct gfs2_meta_header); cp_size = (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize; if (dout) { memcpy(dout, pos, cp_size); dout += sdp->sd_jbsize; } if (din) { gfs2_trans_add_meta(ip->i_gl, bh[x]); memcpy(pos, din, cp_size); din += sdp->sd_jbsize; } amount -= sdp->sd_jbsize; brelse(bh[x]); } out: kfree(bh); return error; } static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, char *data, size_t size) { int ret; size_t len = GFS2_EA_DATA_LEN(el->el_ea); if (len > size) return -ERANGE; if (GFS2_EA_IS_STUFFED(el->el_ea)) { memcpy(data, GFS2_EA2DATA(el->el_ea), len); return len; } ret = gfs2_iter_unstuffed(ip, el->el_ea, NULL, data); if (ret < 0) return ret; return len; } int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata) { struct gfs2_ea_location el; int error; int len; char *data; error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el); if (error) return error; if (!el.el_ea) goto out; if (!GFS2_EA_DATA_LEN(el.el_ea)) goto out; len = GFS2_EA_DATA_LEN(el.el_ea); data = kmalloc(len, GFP_NOFS); error = -ENOMEM; if (data == NULL) goto out; error = gfs2_ea_get_copy(ip, &el, data, len); if (error < 0) kfree(data); else *ppdata = data; out: brelse(el.el_bh); return error; } /** * __gfs2_xattr_get - Get a GFS2 extended attribute * @inode: The inode * @name: The name of the extended attribute * @buffer: The buffer to write the result into * @size: The size of the buffer * @type: The type of extended attribute * * Returns: actual size of data on success, -errno on error */ static int __gfs2_xattr_get(struct inode *inode, const char *name, void *buffer, size_t size, int type) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_ea_location el; int error; if (!ip->i_eattr) return -ENODATA; if (strlen(name) > GFS2_EA_MAX_NAME_LEN) return -EINVAL; error = gfs2_ea_find(ip, type, name, &el); if (error) return error; if (!el.el_ea) return -ENODATA; if (size) error = gfs2_ea_get_copy(ip, &el, buffer, size); else error = GFS2_EA_DATA_LEN(el.el_ea); brelse(el.el_bh); return error; } static int gfs2_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; /* During lookup, SELinux calls this function with the glock locked. */ if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); if (ret) return ret; } else { gfs2_holder_mark_uninitialized(&gh); } ret = __gfs2_xattr_get(inode, name, buffer, size, handler->flags); if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); return ret; } /** * ea_alloc_blk - allocates a new block for extended attributes. * @ip: A pointer to the inode that's getting extended attributes * @bhp: Pointer to pointer to a struct buffer_head * * Returns: errno */ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_ea_header *ea; unsigned int n = 1; u64 block; int error; error = gfs2_alloc_blocks(ip, &block, &n, 0); if (error) return error; gfs2_trans_remove_revoke(sdp, block, 1); *bhp = gfs2_meta_new(ip->i_gl, block); gfs2_trans_add_meta(ip->i_gl, *bhp); gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header)); ea = GFS2_EA_BH2FIRST(*bhp); ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize); ea->ea_type = GFS2_EATYPE_UNUSED; ea->ea_flags = GFS2_EAFLAG_LAST; ea->ea_num_ptrs = 0; gfs2_add_inode_blocks(&ip->i_inode, 1); return 0; } /** * ea_write - writes the request info to an ea, creating new blocks if * necessary * @ip: inode that is being modified * @ea: the location of the new ea in a block * @er: the write request * * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bin of ea_flags * * returns : errno */ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, struct gfs2_ea_request *er) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); int error; ea->ea_data_len = cpu_to_be32(er->er_data_len); ea->ea_name_len = er->er_name_len; ea->ea_type = er->er_type; ea->__pad = 0; memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len); if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) { ea->ea_num_ptrs = 0; memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len); } else { __be64 *dataptr = GFS2_EA2DATAPTRS(ea); const char *data = er->er_data; unsigned int data_len = er->er_data_len; unsigned int copy; unsigned int x; ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize); for (x = 0; x < ea->ea_num_ptrs; x++) { struct buffer_head *bh; u64 block; int mh_size = sizeof(struct gfs2_meta_header); unsigned int n = 1; error = gfs2_alloc_blocks(ip, &block, &n, 0); if (error) return error; gfs2_trans_remove_revoke(sdp, block, 1); bh = gfs2_meta_new(ip->i_gl, block); gfs2_trans_add_meta(ip->i_gl, bh); gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); gfs2_add_inode_blocks(&ip->i_inode, 1); copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize : data_len; memcpy(bh->b_data + mh_size, data, copy); if (copy < sdp->sd_jbsize) memset(bh->b_data + mh_size + copy, 0, sdp->sd_jbsize - copy); *dataptr++ = cpu_to_be64(bh->b_blocknr); data += copy; data_len -= copy; brelse(bh); } gfs2_assert_withdraw(sdp, !data_len); } return 0; } typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip, struct gfs2_ea_request *er, void *private); static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, unsigned int blks, ea_skeleton_call_t skeleton_call, void *private) { struct gfs2_alloc_parms ap = { .target = blks }; int error; error = gfs2_rindex_update(GFS2_SB(&ip->i_inode)); if (error) return error; error = gfs2_quota_lock_check(ip, &ap); if (error) return error; error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_gunlock_q; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), blks + gfs2_rg_blocks(ip, blks) + RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipres; error = skeleton_call(ip, er, private); if (error) goto out_end_trans; inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); out_end_trans: gfs2_trans_end(GFS2_SB(&ip->i_inode)); out_ipres: gfs2_inplace_release(ip); out_gunlock_q: gfs2_quota_unlock(ip); return error; } static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, void *private) { struct buffer_head *bh; int error; error = ea_alloc_blk(ip, &bh); if (error) return error; ip->i_eattr = bh->b_blocknr; error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); brelse(bh); return error; } /* * ea_init - initializes a new eattr block * * Returns: errno */ static int ea_init(struct gfs2_inode *ip, int type, const char *name, const void *data, size_t size) { struct gfs2_ea_request er; unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; unsigned int blks = 1; er.er_type = type; er.er_name = name; er.er_name_len = strlen(name); er.er_data = (void *)data; er.er_data_len = size; if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize) blks += DIV_ROUND_UP(er.er_data_len, jbsize); return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL); } static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) { u32 ea_size = GFS2_EA_SIZE(ea); struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea + ea_size); u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size; int last = ea->ea_flags & GFS2_EAFLAG_LAST; ea->ea_rec_len = cpu_to_be32(ea_size); ea->ea_flags ^= last; new->ea_rec_len = cpu_to_be32(new_size); new->ea_flags = last; return new; } static void ea_set_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) { struct gfs2_ea_header *ea = el->el_ea; struct gfs2_ea_header *prev = el->el_prev; u32 len; gfs2_trans_add_meta(ip->i_gl, el->el_bh); if (!prev || !GFS2_EA_IS_STUFFED(ea)) { ea->ea_type = GFS2_EATYPE_UNUSED; return; } else if (GFS2_EA2NEXT(prev) != ea) { prev = GFS2_EA2NEXT(prev); gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea); } len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); prev->ea_rec_len = cpu_to_be32(len); if (GFS2_EA_IS_LAST(ea)) prev->ea_flags |= GFS2_EAFLAG_LAST; } struct ea_set { int ea_split; struct gfs2_ea_request *es_er; struct gfs2_ea_location *es_el; struct buffer_head *es_bh; struct gfs2_ea_header *es_ea; }; static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct ea_set *es) { struct gfs2_ea_request *er = es->es_er; int error; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0); if (error) return error; gfs2_trans_add_meta(ip->i_gl, bh); if (es->ea_split) ea = ea_split_ea(ea); ea_write(ip, ea, er); if (es->es_el) ea_set_remove_stuffed(ip, es->es_el); inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(GFS2_SB(&ip->i_inode)); return error; } static int ea_set_simple_alloc(struct gfs2_inode *ip, struct gfs2_ea_request *er, void *private) { struct ea_set *es = private; struct gfs2_ea_header *ea = es->es_ea; int error; gfs2_trans_add_meta(ip->i_gl, es->es_bh); if (es->ea_split) ea = ea_split_ea(ea); error = ea_write(ip, ea, er); if (error) return error; if (es->es_el) ea_set_remove_stuffed(ip, es->es_el); return 0; } static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, void *private) { struct ea_set *es = private; unsigned int size; int stuffed; int error; stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len, es->es_er->er_data_len, &size); if (ea->ea_type == GFS2_EATYPE_UNUSED) { if (GFS2_EA_REC_LEN(ea) < size) return 0; if (!GFS2_EA_IS_STUFFED(ea)) { error = ea_remove_unstuffed(ip, bh, ea, prev, 1); if (error) return error; } es->ea_split = 0; } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size) es->ea_split = 1; else return 0; if (stuffed) { error = ea_set_simple_noalloc(ip, bh, ea, es); if (error) return error; } else { unsigned int blks; es->es_bh = bh; es->es_ea = ea; blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); error = ea_alloc_skeleton(ip, es->es_er, blks, ea_set_simple_alloc, es); if (error) return error; } return 1; } static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, void *private) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *indbh, *newbh; __be64 *eablk; int error; int mh_size = sizeof(struct gfs2_meta_header); if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { __be64 *end; error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh); if (error) return error; if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) { error = -EIO; goto out; } eablk = (__be64 *)(indbh->b_data + mh_size); end = eablk + sdp->sd_inptrs; for (; eablk < end; eablk++) if (!*eablk) break; if (eablk == end) { error = -ENOSPC; goto out; } gfs2_trans_add_meta(ip->i_gl, indbh); } else { u64 blk; unsigned int n = 1; error = gfs2_alloc_blocks(ip, &blk, &n, 0); if (error) return error; gfs2_trans_remove_revoke(sdp, blk, 1); indbh = gfs2_meta_new(ip->i_gl, blk); gfs2_trans_add_meta(ip->i_gl, indbh); gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); gfs2_buffer_clear_tail(indbh, mh_size); eablk = (__be64 *)(indbh->b_data + mh_size); *eablk = cpu_to_be64(ip->i_eattr); ip->i_eattr = blk; ip->i_diskflags |= GFS2_DIF_EA_INDIRECT; gfs2_add_inode_blocks(&ip->i_inode, 1); eablk++; } error = ea_alloc_blk(ip, &newbh); if (error) goto out; *eablk = cpu_to_be64((u64)newbh->b_blocknr); error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er); brelse(newbh); if (error) goto out; if (private) ea_set_remove_stuffed(ip, private); out: brelse(indbh); return error; } static int ea_set_i(struct gfs2_inode *ip, int type, const char *name, const void *value, size_t size, struct gfs2_ea_location *el) { struct gfs2_ea_request er; struct ea_set es; unsigned int blks = 2; int error; er.er_type = type; er.er_name = name; er.er_data = (void *)value; er.er_name_len = strlen(name); er.er_data_len = size; memset(&es, 0, sizeof(struct ea_set)); es.es_er = &er; es.es_el = el; error = ea_foreach(ip, ea_set_simple, &es); if (error > 0) return 0; if (error) return error; if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) blks++; if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize) blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el); } static int ea_set_remove_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) { if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) { el->el_prev = GFS2_EA2NEXT(el->el_prev); gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(el->el_prev) == el->el_ea); } return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0); } static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) { struct gfs2_ea_header *ea = el->el_ea; struct gfs2_ea_header *prev = el->el_prev; int error; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); if (error) return error; gfs2_trans_add_meta(ip->i_gl, el->el_bh); if (prev) { u32 len; len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); prev->ea_rec_len = cpu_to_be32(len); if (GFS2_EA_IS_LAST(ea)) prev->ea_flags |= GFS2_EAFLAG_LAST; } else { ea->ea_type = GFS2_EATYPE_UNUSED; } inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(GFS2_SB(&ip->i_inode)); return error; } /** * gfs2_xattr_remove - Remove a GFS2 extended attribute * @ip: The inode * @type: The type of the extended attribute * @name: The name of the extended attribute * * This is not called directly by the VFS since we use the (common) * scheme of making a "set with NULL data" mean a remove request. Note * that this is different from a set with zero length data. * * Returns: 0, or errno on failure */ static int gfs2_xattr_remove(struct gfs2_inode *ip, int type, const char *name) { struct gfs2_ea_location el; int error; if (!ip->i_eattr) return -ENODATA; error = gfs2_ea_find(ip, type, name, &el); if (error) return error; if (!el.el_ea) return -ENODATA; if (GFS2_EA_IS_STUFFED(el.el_ea)) error = ea_remove_stuffed(ip, &el); else error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0); brelse(el.el_bh); return error; } /** * __gfs2_xattr_set - Set (or remove) a GFS2 extended attribute * @inode: The inode * @name: The name of the extended attribute * @value: The value of the extended attribute (NULL for remove) * @size: The size of the @value argument * @flags: Create or Replace * @type: The type of the extended attribute * * See gfs2_xattr_remove() for details of the removal of xattrs. * * Returns: 0 or errno on failure */ int __gfs2_xattr_set(struct inode *inode, const char *name, const void *value, size_t size, int flags, int type) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_ea_location el; unsigned int namel = strlen(name); int error; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; if (namel > GFS2_EA_MAX_NAME_LEN) return -ERANGE; if (value == NULL) { error = gfs2_xattr_remove(ip, type, name); if (error == -ENODATA && !(flags & XATTR_REPLACE)) error = 0; return error; } if (ea_check_size(sdp, namel, size)) return -ERANGE; if (!ip->i_eattr) { if (flags & XATTR_REPLACE) return -ENODATA; return ea_init(ip, type, name, value, size); } error = gfs2_ea_find(ip, type, name, &el); if (error) return error; if (el.el_ea) { if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { brelse(el.el_bh); return -EPERM; } error = -EEXIST; if (!(flags & XATTR_CREATE)) { int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); error = ea_set_i(ip, type, name, value, size, &el); if (!error && unstuffed) ea_set_remove_unstuffed(ip, &el); } brelse(el.el_bh); return error; } error = -ENODATA; if (!(flags & XATTR_REPLACE)) error = ea_set_i(ip, type, name, value, size, NULL); return error; } static int gfs2_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; ret = gfs2_qa_get(ip); if (ret) return ret; /* May be called from gfs_setattr with the glock locked. */ if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); if (ret) goto out; } else { if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE)) { ret = -EIO; goto out; } gfs2_holder_mark_uninitialized(&gh); } ret = __gfs2_xattr_set(inode, name, value, size, flags, handler->flags); if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); out: gfs2_qa_put(ip); return ret; } static int ea_dealloc_indirect(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrp_list rlist; struct gfs2_rgrpd *rgd; struct buffer_head *indbh, *dibh; __be64 *eablk, *end; unsigned int rg_blocks = 0; u64 bstart = 0; unsigned int blen = 0; unsigned int blks = 0; unsigned int x; int error; error = gfs2_rindex_update(sdp); if (error) return error; memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh); if (error) return error; if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) { error = -EIO; goto out; } eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); end = eablk + sdp->sd_inptrs; for (; eablk < end; eablk++) { u64 bn; if (!*eablk) break; bn = be64_to_cpu(*eablk); if (bstart + blen == bn) blen++; else { if (bstart) gfs2_rlist_add(ip, &rlist, bstart); bstart = bn; blen = 1; } blks++; } if (bstart) gfs2_rlist_add(ip, &rlist, bstart); else goto out; gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE); for (x = 0; x < rlist.rl_rgrps; x++) { rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); rg_blocks += rgd->rd_length; } error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); if (error) goto out_rlist_free; error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT + RES_STATFS + RES_QUOTA, blks); if (error) goto out_gunlock; gfs2_trans_add_meta(ip->i_gl, indbh); eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); bstart = 0; rgd = NULL; blen = 0; for (; eablk < end; eablk++) { u64 bn; if (!*eablk) break; bn = be64_to_cpu(*eablk); if (bstart + blen == bn) blen++; else { if (bstart) gfs2_free_meta(ip, rgd, bstart, blen); bstart = bn; rgd = gfs2_blk2rgrpd(sdp, bstart, true); blen = 1; } *eablk = 0; gfs2_add_inode_blocks(&ip->i_inode, -1); } if (bstart) gfs2_free_meta(ip, rgd, bstart, blen); ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT; error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } gfs2_trans_end(sdp); out_gunlock: gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); out_rlist_free: gfs2_rlist_free(&rlist); out: brelse(indbh); return error; } static int ea_dealloc_block(struct gfs2_inode *ip, bool initialized) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; struct buffer_head *dibh; struct gfs2_holder gh; int error; error = gfs2_rindex_update(sdp); if (error) return error; rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr, 1); if (!rgd) { gfs2_consist_inode(ip); return -EIO; } error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE, &gh); if (error) return error; error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS + RES_QUOTA, 1); if (error) goto out_gunlock; gfs2_free_meta(ip, rgd, ip->i_eattr, 1); ip->i_eattr = 0; gfs2_add_inode_blocks(&ip->i_inode, -1); if (initialized) { error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } } gfs2_trans_end(sdp); out_gunlock: gfs2_glock_dq_uninit(&gh); return error; } /** * gfs2_ea_dealloc - deallocate the extended attribute fork * @ip: the inode * @initialized: xattrs have been initialized * * Returns: errno */ int gfs2_ea_dealloc(struct gfs2_inode *ip, bool initialized) { int error; error = gfs2_rindex_update(GFS2_SB(&ip->i_inode)); if (error) return error; error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) return error; if (initialized) { error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); if (error) goto out_quota; if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { error = ea_dealloc_indirect(ip); if (error) goto out_quota; } } error = ea_dealloc_block(ip, initialized); out_quota: gfs2_quota_unhold(ip); return error; } static const struct xattr_handler gfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = GFS2_EATYPE_USR, .get = gfs2_xattr_get, .set = gfs2_xattr_set, }; static const struct xattr_handler gfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = GFS2_EATYPE_SECURITY, .get = gfs2_xattr_get, .set = gfs2_xattr_set, }; static bool gfs2_xattr_trusted_list(struct dentry *dentry) { return capable(CAP_SYS_ADMIN); } static const struct xattr_handler gfs2_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .flags = GFS2_EATYPE_TRUSTED, .list = gfs2_xattr_trusted_list, .get = gfs2_xattr_get, .set = gfs2_xattr_set, }; const struct xattr_handler * const gfs2_xattr_handlers_max[] = { /* GFS2_FS_FORMAT_MAX */ &gfs2_xattr_trusted_handler, /* GFS2_FS_FORMAT_MIN */ &gfs2_xattr_user_handler, &gfs2_xattr_security_handler, NULL, }; const struct xattr_handler * const *gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1;
1 1 1 1 1 1 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 // SPDX-License-Identifier: GPL-2.0-only /* * The NFC Controller Interface is the communication protocol between an * NFC Controller (NFCC) and a Device Host (DH). * * Copyright (C) 2014 Marvell International Ltd. * Copyright (C) 2011 Texas Instruments, Inc. * * Written by Ilan Elias <ilane@ti.com> * * Acknowledgements: * This file is based on hci_event.c, which was written * by Maxim Krasnyansky. */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <linux/types.h> #include <linux/interrupt.h> #include <linux/bitops.h> #include <linux/skbuff.h> #include "../nfc.h" #include <net/nfc/nci.h> #include <net/nfc/nci_core.h> #include <linux/nfc.h> /* Handle NCI Notification packets */ static int nci_core_reset_ntf_packet(struct nci_dev *ndev, const struct sk_buff *skb) { /* Handle NCI 2.x core reset notification */ const struct nci_core_reset_ntf *ntf; if (skb->len < sizeof(struct nci_core_reset_ntf)) return -EINVAL; ntf = (struct nci_core_reset_ntf *)skb->data; ndev->nci_ver = ntf->nci_ver; pr_debug("nci_ver 0x%x, config_status 0x%x\n", ntf->nci_ver, ntf->config_status); ndev->manufact_id = ntf->manufact_id; ndev->manufact_specific_info = __le32_to_cpu(ntf->manufact_specific_info); nci_req_complete(ndev, NCI_STATUS_OK); return 0; } static int nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) { struct nci_core_conn_credit_ntf *ntf; struct nci_conn_info *conn_info; int i; if (skb->len < sizeof(struct nci_core_conn_credit_ntf)) return -EINVAL; ntf = (struct nci_core_conn_credit_ntf *)skb->data; pr_debug("num_entries %d\n", ntf->num_entries); if (ntf->num_entries > NCI_MAX_NUM_CONN) ntf->num_entries = NCI_MAX_NUM_CONN; /* update the credits */ for (i = 0; i < ntf->num_entries; i++) { ntf->conn_entries[i].conn_id = nci_conn_id(&ntf->conn_entries[i].conn_id); pr_debug("entry[%d]: conn_id %d, credits %d\n", i, ntf->conn_entries[i].conn_id, ntf->conn_entries[i].credits); conn_info = nci_get_conn_info_by_conn_id(ndev, ntf->conn_entries[i].conn_id); if (!conn_info) return 0; atomic_add(ntf->conn_entries[i].credits, &conn_info->credits_cnt); } /* trigger the next tx */ if (!skb_queue_empty(&ndev->tx_q)) queue_work(ndev->tx_wq, &ndev->tx_work); return 0; } static int nci_core_generic_error_ntf_packet(struct nci_dev *ndev, const struct sk_buff *skb) { __u8 status; if (skb->len < 1) return -EINVAL; status = skb->data[0]; pr_debug("status 0x%x\n", status); if (atomic_read(&ndev->state) == NCI_W4_HOST_SELECT) { /* Activation failed, so complete the request (the state remains the same) */ nci_req_complete(ndev, status); } return 0; } static int nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) { struct nci_core_intf_error_ntf *ntf; if (skb->len < sizeof(struct nci_core_intf_error_ntf)) return -EINVAL; ntf = (struct nci_core_intf_error_ntf *)skb->data; ntf->conn_id = nci_conn_id(&ntf->conn_id); pr_debug("status 0x%x, conn_id %d\n", ntf->status, ntf->conn_id); /* complete the data exchange transaction, if exists */ if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags)) nci_data_exchange_complete(ndev, NULL, ntf->conn_id, -EIO); return 0; } static const __u8 * nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev, struct rf_tech_specific_params_nfca_poll *nfca_poll, const __u8 *data) { nfca_poll->sens_res = __le16_to_cpu(*((__le16 *)data)); data += 2; nfca_poll->nfcid1_len = min_t(__u8, *data++, NFC_NFCID1_MAXSIZE); pr_debug("sens_res 0x%x, nfcid1_len %d\n", nfca_poll->sens_res, nfca_poll->nfcid1_len); memcpy(nfca_poll->nfcid1, data, nfca_poll->nfcid1_len); data += nfca_poll->nfcid1_len; nfca_poll->sel_res_len = *data++; if (nfca_poll->sel_res_len != 0) nfca_poll->sel_res = *data++; pr_debug("sel_res_len %d, sel_res 0x%x\n", nfca_poll->sel_res_len, nfca_poll->sel_res); return data; } static const __u8 * nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev, struct rf_tech_specific_params_nfcb_poll *nfcb_poll, const __u8 *data) { nfcb_poll->sensb_res_len = min_t(__u8, *data++, NFC_SENSB_RES_MAXSIZE); pr_debug("sensb_res_len %d\n", nfcb_poll->sensb_res_len); memcpy(nfcb_poll->sensb_res, data, nfcb_poll->sensb_res_len); data += nfcb_poll->sensb_res_len; return data; } static const __u8 * nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, struct rf_tech_specific_params_nfcf_poll *nfcf_poll, const __u8 *data) { nfcf_poll->bit_rate = *data++; nfcf_poll->sensf_res_len = min_t(__u8, *data++, NFC_SENSF_RES_MAXSIZE); pr_debug("bit_rate %d, sensf_res_len %d\n", nfcf_poll->bit_rate, nfcf_poll->sensf_res_len); memcpy(nfcf_poll->sensf_res, data, nfcf_poll->sensf_res_len); data += nfcf_poll->sensf_res_len; return data; } static const __u8 * nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, struct rf_tech_specific_params_nfcv_poll *nfcv_poll, const __u8 *data) { ++data; nfcv_poll->dsfid = *data++; memcpy(nfcv_poll->uid, data, NFC_ISO15693_UID_MAXSIZE); data += NFC_ISO15693_UID_MAXSIZE; return data; } static const __u8 * nci_extract_rf_params_nfcf_passive_listen(struct nci_dev *ndev, struct rf_tech_specific_params_nfcf_listen *nfcf_listen, const __u8 *data) { nfcf_listen->local_nfcid2_len = min_t(__u8, *data++, NFC_NFCID2_MAXSIZE); memcpy(nfcf_listen->local_nfcid2, data, nfcf_listen->local_nfcid2_len); data += nfcf_listen->local_nfcid2_len; return data; } static __u32 nci_get_prop_rf_protocol(struct nci_dev *ndev, __u8 rf_protocol) { if (ndev->ops->get_rfprotocol) return ndev->ops->get_rfprotocol(ndev, rf_protocol); return 0; } static int nci_add_new_protocol(struct nci_dev *ndev, struct nfc_target *target, __u8 rf_protocol, __u8 rf_tech_and_mode, const void *params) { const struct rf_tech_specific_params_nfca_poll *nfca_poll; const struct rf_tech_specific_params_nfcb_poll *nfcb_poll; const struct rf_tech_specific_params_nfcf_poll *nfcf_poll; const struct rf_tech_specific_params_nfcv_poll *nfcv_poll; __u32 protocol; if (rf_protocol == NCI_RF_PROTOCOL_T1T) protocol = NFC_PROTO_JEWEL_MASK; else if (rf_protocol == NCI_RF_PROTOCOL_T2T) protocol = NFC_PROTO_MIFARE_MASK; else if (rf_protocol == NCI_RF_PROTOCOL_ISO_DEP) if (rf_tech_and_mode == NCI_NFC_A_PASSIVE_POLL_MODE) protocol = NFC_PROTO_ISO14443_MASK; else protocol = NFC_PROTO_ISO14443_B_MASK; else if (rf_protocol == NCI_RF_PROTOCOL_T3T) protocol = NFC_PROTO_FELICA_MASK; else if (rf_protocol == NCI_RF_PROTOCOL_NFC_DEP) protocol = NFC_PROTO_NFC_DEP_MASK; else if (rf_protocol == NCI_RF_PROTOCOL_T5T) protocol = NFC_PROTO_ISO15693_MASK; else protocol = nci_get_prop_rf_protocol(ndev, rf_protocol); if (!(protocol & ndev->poll_prots)) { pr_err("the target found does not have the desired protocol\n"); return -EPROTO; } if (rf_tech_and_mode == NCI_NFC_A_PASSIVE_POLL_MODE) { nfca_poll = (struct rf_tech_specific_params_nfca_poll *)params; target->sens_res = nfca_poll->sens_res; target->sel_res = nfca_poll->sel_res; target->nfcid1_len = nfca_poll->nfcid1_len; if (target->nfcid1_len > ARRAY_SIZE(target->nfcid1)) return -EPROTO; if (target->nfcid1_len > 0) { memcpy(target->nfcid1, nfca_poll->nfcid1, target->nfcid1_len); } } else if (rf_tech_and_mode == NCI_NFC_B_PASSIVE_POLL_MODE) { nfcb_poll = (struct rf_tech_specific_params_nfcb_poll *)params; target->sensb_res_len = nfcb_poll->sensb_res_len; if (target->sensb_res_len > ARRAY_SIZE(target->sensb_res)) return -EPROTO; if (target->sensb_res_len > 0) { memcpy(target->sensb_res, nfcb_poll->sensb_res, target->sensb_res_len); } } else if (rf_tech_and_mode == NCI_NFC_F_PASSIVE_POLL_MODE) { nfcf_poll = (struct rf_tech_specific_params_nfcf_poll *)params; target->sensf_res_len = nfcf_poll->sensf_res_len; if (target->sensf_res_len > ARRAY_SIZE(target->sensf_res)) return -EPROTO; if (target->sensf_res_len > 0) { memcpy(target->sensf_res, nfcf_poll->sensf_res, target->sensf_res_len); } } else if (rf_tech_and_mode == NCI_NFC_V_PASSIVE_POLL_MODE) { nfcv_poll = (struct rf_tech_specific_params_nfcv_poll *)params; target->is_iso15693 = 1; target->iso15693_dsfid = nfcv_poll->dsfid; memcpy(target->iso15693_uid, nfcv_poll->uid, NFC_ISO15693_UID_MAXSIZE); } else { pr_err("unsupported rf_tech_and_mode 0x%x\n", rf_tech_and_mode); return -EPROTO; } target->supported_protocols |= protocol; pr_debug("protocol 0x%x\n", protocol); return 0; } static void nci_add_new_target(struct nci_dev *ndev, const struct nci_rf_discover_ntf *ntf) { struct nfc_target *target; int i, rc; for (i = 0; i < ndev->n_targets; i++) { target = &ndev->targets[i]; if (target->logical_idx == ntf->rf_discovery_id) { /* This target already exists, add the new protocol */ nci_add_new_protocol(ndev, target, ntf->rf_protocol, ntf->rf_tech_and_mode, &ntf->rf_tech_specific_params); return; } } /* This is a new target, check if we've enough room */ if (ndev->n_targets == NCI_MAX_DISCOVERED_TARGETS) { pr_debug("not enough room, ignoring new target...\n"); return; } target = &ndev->targets[ndev->n_targets]; rc = nci_add_new_protocol(ndev, target, ntf->rf_protocol, ntf->rf_tech_and_mode, &ntf->rf_tech_specific_params); if (!rc) { target->logical_idx = ntf->rf_discovery_id; ndev->n_targets++; pr_debug("logical idx %d, n_targets %d\n", target->logical_idx, ndev->n_targets); } } void nci_clear_target_list(struct nci_dev *ndev) { memset(ndev->targets, 0, (sizeof(struct nfc_target)*NCI_MAX_DISCOVERED_TARGETS)); ndev->n_targets = 0; } static int nci_rf_discover_ntf_packet(struct nci_dev *ndev, const struct sk_buff *skb) { struct nci_rf_discover_ntf ntf; const __u8 *data; bool add_target = true; if (skb->len < sizeof(struct nci_rf_discover_ntf)) return -EINVAL; data = skb->data; ntf.rf_discovery_id = *data++; ntf.rf_protocol = *data++; ntf.rf_tech_and_mode = *data++; ntf.rf_tech_specific_params_len = *data++; pr_debug("rf_discovery_id %d\n", ntf.rf_discovery_id); pr_debug("rf_protocol 0x%x\n", ntf.rf_protocol); pr_debug("rf_tech_and_mode 0x%x\n", ntf.rf_tech_and_mode); pr_debug("rf_tech_specific_params_len %d\n", ntf.rf_tech_specific_params_len); if (ntf.rf_tech_specific_params_len > 0) { switch (ntf.rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfca_passive_poll(ndev, &(ntf.rf_tech_specific_params.nfca_poll), data); break; case NCI_NFC_B_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcb_passive_poll(ndev, &(ntf.rf_tech_specific_params.nfcb_poll), data); break; case NCI_NFC_F_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcf_passive_poll(ndev, &(ntf.rf_tech_specific_params.nfcf_poll), data); break; case NCI_NFC_V_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcv_passive_poll(ndev, &(ntf.rf_tech_specific_params.nfcv_poll), data); break; default: pr_err("unsupported rf_tech_and_mode 0x%x\n", ntf.rf_tech_and_mode); data += ntf.rf_tech_specific_params_len; add_target = false; } } ntf.ntf_type = *data++; pr_debug("ntf_type %d\n", ntf.ntf_type); if (add_target == true) nci_add_new_target(ndev, &ntf); if (ntf.ntf_type == NCI_DISCOVER_NTF_TYPE_MORE) { atomic_set(&ndev->state, NCI_W4_ALL_DISCOVERIES); } else { atomic_set(&ndev->state, NCI_W4_HOST_SELECT); nfc_targets_found(ndev->nfc_dev, ndev->targets, ndev->n_targets); } return 0; } static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev, struct nci_rf_intf_activated_ntf *ntf, const __u8 *data) { struct activation_params_nfca_poll_iso_dep *nfca_poll; struct activation_params_nfcb_poll_iso_dep *nfcb_poll; switch (ntf->activation_rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: nfca_poll = &ntf->activation_params.nfca_poll_iso_dep; nfca_poll->rats_res_len = min_t(__u8, *data++, NFC_ATS_MAXSIZE); pr_debug("rats_res_len %d\n", nfca_poll->rats_res_len); if (nfca_poll->rats_res_len > 0) { memcpy(nfca_poll->rats_res, data, nfca_poll->rats_res_len); } break; case NCI_NFC_B_PASSIVE_POLL_MODE: nfcb_poll = &ntf->activation_params.nfcb_poll_iso_dep; nfcb_poll->attrib_res_len = min_t(__u8, *data++, 50); pr_debug("attrib_res_len %d\n", nfcb_poll->attrib_res_len); if (nfcb_poll->attrib_res_len > 0) { memcpy(nfcb_poll->attrib_res, data, nfcb_poll->attrib_res_len); } break; default: pr_err("unsupported activation_rf_tech_and_mode 0x%x\n", ntf->activation_rf_tech_and_mode); return NCI_STATUS_RF_PROTOCOL_ERROR; } return NCI_STATUS_OK; } static int nci_extract_activation_params_nfc_dep(struct nci_dev *ndev, struct nci_rf_intf_activated_ntf *ntf, const __u8 *data) { struct activation_params_poll_nfc_dep *poll; struct activation_params_listen_nfc_dep *listen; switch (ntf->activation_rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: case NCI_NFC_F_PASSIVE_POLL_MODE: poll = &ntf->activation_params.poll_nfc_dep; poll->atr_res_len = min_t(__u8, *data++, NFC_ATR_RES_MAXSIZE - 2); pr_debug("atr_res_len %d\n", poll->atr_res_len); if (poll->atr_res_len > 0) memcpy(poll->atr_res, data, poll->atr_res_len); break; case NCI_NFC_A_PASSIVE_LISTEN_MODE: case NCI_NFC_F_PASSIVE_LISTEN_MODE: listen = &ntf->activation_params.listen_nfc_dep; listen->atr_req_len = min_t(__u8, *data++, NFC_ATR_REQ_MAXSIZE - 2); pr_debug("atr_req_len %d\n", listen->atr_req_len); if (listen->atr_req_len > 0) memcpy(listen->atr_req, data, listen->atr_req_len); break; default: pr_err("unsupported activation_rf_tech_and_mode 0x%x\n", ntf->activation_rf_tech_and_mode); return NCI_STATUS_RF_PROTOCOL_ERROR; } return NCI_STATUS_OK; } static void nci_target_auto_activated(struct nci_dev *ndev, const struct nci_rf_intf_activated_ntf *ntf) { struct nfc_target *target; int rc; target = &ndev->targets[ndev->n_targets]; rc = nci_add_new_protocol(ndev, target, ntf->rf_protocol, ntf->activation_rf_tech_and_mode, &ntf->rf_tech_specific_params); if (rc) return; target->logical_idx = ntf->rf_discovery_id; ndev->n_targets++; pr_debug("logical idx %d, n_targets %d\n", target->logical_idx, ndev->n_targets); nfc_targets_found(ndev->nfc_dev, ndev->targets, ndev->n_targets); } static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev, const struct nci_rf_intf_activated_ntf *ntf) { ndev->remote_gb_len = 0; if (ntf->activation_params_len <= 0) return NCI_STATUS_OK; switch (ntf->activation_rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: case NCI_NFC_F_PASSIVE_POLL_MODE: ndev->remote_gb_len = min_t(__u8, (ntf->activation_params.poll_nfc_dep.atr_res_len - NFC_ATR_RES_GT_OFFSET), NFC_ATR_RES_GB_MAXSIZE); memcpy(ndev->remote_gb, (ntf->activation_params.poll_nfc_dep.atr_res + NFC_ATR_RES_GT_OFFSET), ndev->remote_gb_len); break; case NCI_NFC_A_PASSIVE_LISTEN_MODE: case NCI_NFC_F_PASSIVE_LISTEN_MODE: ndev->remote_gb_len = min_t(__u8, (ntf->activation_params.listen_nfc_dep.atr_req_len - NFC_ATR_REQ_GT_OFFSET), NFC_ATR_REQ_GB_MAXSIZE); memcpy(ndev->remote_gb, (ntf->activation_params.listen_nfc_dep.atr_req + NFC_ATR_REQ_GT_OFFSET), ndev->remote_gb_len); break; default: pr_err("unsupported activation_rf_tech_and_mode 0x%x\n", ntf->activation_rf_tech_and_mode); return NCI_STATUS_RF_PROTOCOL_ERROR; } return NCI_STATUS_OK; } static int nci_store_ats_nfc_iso_dep(struct nci_dev *ndev, const struct nci_rf_intf_activated_ntf *ntf) { ndev->target_ats_len = 0; if (ntf->activation_params_len <= 0) return NCI_STATUS_OK; if (ntf->activation_params.nfca_poll_iso_dep.rats_res_len > NFC_ATS_MAXSIZE) { pr_debug("ATS too long\n"); return NCI_STATUS_RF_PROTOCOL_ERROR; } if (ntf->activation_params.nfca_poll_iso_dep.rats_res_len > 0) { ndev->target_ats_len = ntf->activation_params.nfca_poll_iso_dep.rats_res_len; memcpy(ndev->target_ats, ntf->activation_params.nfca_poll_iso_dep.rats_res, ndev->target_ats_len); } return NCI_STATUS_OK; } static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, const struct sk_buff *skb) { struct nci_conn_info *conn_info; struct nci_rf_intf_activated_ntf ntf; const __u8 *data; int err = NCI_STATUS_OK; if (skb->len < sizeof(struct nci_rf_intf_activated_ntf)) return -EINVAL; data = skb->data; ntf.rf_discovery_id = *data++; ntf.rf_interface = *data++; ntf.rf_protocol = *data++; ntf.activation_rf_tech_and_mode = *data++; ntf.max_data_pkt_payload_size = *data++; ntf.initial_num_credits = *data++; ntf.rf_tech_specific_params_len = *data++; pr_debug("rf_discovery_id %d\n", ntf.rf_discovery_id); pr_debug("rf_interface 0x%x\n", ntf.rf_interface); pr_debug("rf_protocol 0x%x\n", ntf.rf_protocol); pr_debug("activation_rf_tech_and_mode 0x%x\n", ntf.activation_rf_tech_and_mode); pr_debug("max_data_pkt_payload_size 0x%x\n", ntf.max_data_pkt_payload_size); pr_debug("initial_num_credits 0x%x\n", ntf.initial_num_credits); pr_debug("rf_tech_specific_params_len %d\n", ntf.rf_tech_specific_params_len); /* If this contains a value of 0x00 (NFCEE Direct RF * Interface) then all following parameters SHALL contain a * value of 0 and SHALL be ignored. */ if (ntf.rf_interface == NCI_RF_INTERFACE_NFCEE_DIRECT) goto listen; if (ntf.rf_tech_specific_params_len > 0) { switch (ntf.activation_rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfca_passive_poll(ndev, &(ntf.rf_tech_specific_params.nfca_poll), data); break; case NCI_NFC_B_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcb_passive_poll(ndev, &(ntf.rf_tech_specific_params.nfcb_poll), data); break; case NCI_NFC_F_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcf_passive_poll(ndev, &(ntf.rf_tech_specific_params.nfcf_poll), data); break; case NCI_NFC_V_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcv_passive_poll(ndev, &(ntf.rf_tech_specific_params.nfcv_poll), data); break; case NCI_NFC_A_PASSIVE_LISTEN_MODE: /* no RF technology specific parameters */ break; case NCI_NFC_F_PASSIVE_LISTEN_MODE: data = nci_extract_rf_params_nfcf_passive_listen(ndev, &(ntf.rf_tech_specific_params.nfcf_listen), data); break; default: pr_err("unsupported activation_rf_tech_and_mode 0x%x\n", ntf.activation_rf_tech_and_mode); err = NCI_STATUS_RF_PROTOCOL_ERROR; goto exit; } } ntf.data_exch_rf_tech_and_mode = *data++; ntf.data_exch_tx_bit_rate = *data++; ntf.data_exch_rx_bit_rate = *data++; ntf.activation_params_len = *data++; pr_debug("data_exch_rf_tech_and_mode 0x%x\n", ntf.data_exch_rf_tech_and_mode); pr_debug("data_exch_tx_bit_rate 0x%x\n", ntf.data_exch_tx_bit_rate); pr_debug("data_exch_rx_bit_rate 0x%x\n", ntf.data_exch_rx_bit_rate); pr_debug("activation_params_len %d\n", ntf.activation_params_len); if (ntf.activation_params_len > 0) { switch (ntf.rf_interface) { case NCI_RF_INTERFACE_ISO_DEP: err = nci_extract_activation_params_iso_dep(ndev, &ntf, data); break; case NCI_RF_INTERFACE_NFC_DEP: err = nci_extract_activation_params_nfc_dep(ndev, &ntf, data); break; case NCI_RF_INTERFACE_FRAME: /* no activation params */ break; default: pr_err("unsupported rf_interface 0x%x\n", ntf.rf_interface); err = NCI_STATUS_RF_PROTOCOL_ERROR; break; } } exit: if (err == NCI_STATUS_OK) { conn_info = ndev->rf_conn_info; if (!conn_info) return 0; conn_info->max_pkt_payload_len = ntf.max_data_pkt_payload_size; conn_info->initial_num_credits = ntf.initial_num_credits; /* set the available credits to initial value */ atomic_set(&conn_info->credits_cnt, conn_info->initial_num_credits); /* store general bytes to be reported later in dep_link_up */ if (ntf.rf_interface == NCI_RF_INTERFACE_NFC_DEP) { err = nci_store_general_bytes_nfc_dep(ndev, &ntf); if (err != NCI_STATUS_OK) pr_err("unable to store general bytes\n"); } /* store ATS to be reported later in nci_activate_target */ if (ntf.rf_interface == NCI_RF_INTERFACE_ISO_DEP && ntf.activation_rf_tech_and_mode == NCI_NFC_A_PASSIVE_POLL_MODE) { err = nci_store_ats_nfc_iso_dep(ndev, &ntf); if (err != NCI_STATUS_OK) pr_err("unable to store ATS\n"); } } if (!(ntf.activation_rf_tech_and_mode & NCI_RF_TECH_MODE_LISTEN_MASK)) { /* Poll mode */ if (atomic_read(&ndev->state) == NCI_DISCOVERY) { /* A single target was found and activated * automatically */ atomic_set(&ndev->state, NCI_POLL_ACTIVE); if (err == NCI_STATUS_OK) nci_target_auto_activated(ndev, &ntf); } else { /* ndev->state == NCI_W4_HOST_SELECT */ /* A selected target was activated, so complete the * request */ atomic_set(&ndev->state, NCI_POLL_ACTIVE); nci_req_complete(ndev, err); } } else { listen: /* Listen mode */ atomic_set(&ndev->state, NCI_LISTEN_ACTIVE); if (err == NCI_STATUS_OK && ntf.rf_protocol == NCI_RF_PROTOCOL_NFC_DEP) { err = nfc_tm_activated(ndev->nfc_dev, NFC_PROTO_NFC_DEP_MASK, NFC_COMM_PASSIVE, ndev->remote_gb, ndev->remote_gb_len); if (err != NCI_STATUS_OK) pr_err("error when signaling tm activation\n"); } } return 0; } static int nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, const struct sk_buff *skb) { const struct nci_conn_info *conn_info; const struct nci_rf_deactivate_ntf *ntf; if (skb->len < sizeof(struct nci_rf_deactivate_ntf)) return -EINVAL; ntf = (struct nci_rf_deactivate_ntf *)skb->data; pr_debug("entry, type 0x%x, reason 0x%x\n", ntf->type, ntf->reason); conn_info = ndev->rf_conn_info; if (!conn_info) return 0; /* drop tx data queue */ skb_queue_purge(&ndev->tx_q); /* drop partial rx data packet */ if (ndev->rx_data_reassembly) { kfree_skb(ndev->rx_data_reassembly); ndev->rx_data_reassembly = NULL; } /* complete the data exchange transaction, if exists */ if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags)) nci_data_exchange_complete(ndev, NULL, NCI_STATIC_RF_CONN_ID, -EIO); switch (ntf->type) { case NCI_DEACTIVATE_TYPE_IDLE_MODE: nci_clear_target_list(ndev); atomic_set(&ndev->state, NCI_IDLE); break; case NCI_DEACTIVATE_TYPE_SLEEP_MODE: case NCI_DEACTIVATE_TYPE_SLEEP_AF_MODE: atomic_set(&ndev->state, NCI_W4_HOST_SELECT); break; case NCI_DEACTIVATE_TYPE_DISCOVERY: nci_clear_target_list(ndev); atomic_set(&ndev->state, NCI_DISCOVERY); break; } nci_req_complete(ndev, NCI_STATUS_OK); return 0; } static int nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, const struct sk_buff *skb) { u8 status = NCI_STATUS_OK; const struct nci_nfcee_discover_ntf *nfcee_ntf; if (skb->len < sizeof(struct nci_nfcee_discover_ntf)) return -EINVAL; nfcee_ntf = (struct nci_nfcee_discover_ntf *)skb->data; /* NFCForum NCI 9.2.1 HCI Network Specific Handling * If the NFCC supports the HCI Network, it SHALL return one, * and only one, NFCEE_DISCOVER_NTF with a Protocol type of * “HCI Access”, even if the HCI Network contains multiple NFCEEs. */ ndev->hci_dev->nfcee_id = nfcee_ntf->nfcee_id; ndev->cur_params.id = nfcee_ntf->nfcee_id; nci_req_complete(ndev, status); return 0; } void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) { __u16 ntf_opcode = nci_opcode(skb->data); pr_debug("NCI RX: MT=ntf, PBF=%d, GID=0x%x, OID=0x%x, plen=%d\n", nci_pbf(skb->data), nci_opcode_gid(ntf_opcode), nci_opcode_oid(ntf_opcode), nci_plen(skb->data)); /* strip the nci control header */ skb_pull(skb, NCI_CTRL_HDR_SIZE); if (nci_opcode_gid(ntf_opcode) == NCI_GID_PROPRIETARY) { if (nci_prop_ntf_packet(ndev, ntf_opcode, skb) == -ENOTSUPP) { pr_err("unsupported ntf opcode 0x%x\n", ntf_opcode); } goto end; } switch (ntf_opcode) { case NCI_OP_CORE_RESET_NTF: if (nci_core_reset_ntf_packet(ndev, skb)) goto end; break; case NCI_OP_CORE_CONN_CREDITS_NTF: if (nci_core_conn_credits_ntf_packet(ndev, skb)) goto end; break; case NCI_OP_CORE_GENERIC_ERROR_NTF: if (nci_core_generic_error_ntf_packet(ndev, skb)) goto end; break; case NCI_OP_CORE_INTF_ERROR_NTF: if (nci_core_conn_intf_error_ntf_packet(ndev, skb)) goto end; break; case NCI_OP_RF_DISCOVER_NTF: if (nci_rf_discover_ntf_packet(ndev, skb)) goto end; break; case NCI_OP_RF_INTF_ACTIVATED_NTF: if (nci_rf_intf_activated_ntf_packet(ndev, skb)) goto end; break; case NCI_OP_RF_DEACTIVATE_NTF: if (nci_rf_deactivate_ntf_packet(ndev, skb)) goto end; break; case NCI_OP_NFCEE_DISCOVER_NTF: if (nci_nfcee_discover_ntf_packet(ndev, skb)) goto end; break; case NCI_OP_RF_NFCEE_ACTION_NTF: break; default: pr_err("unknown ntf opcode 0x%x\n", ntf_opcode); break; } nci_core_ntf_packet(ndev, ntf_opcode, skb); end: kfree_skb(skb); }
2 7 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 /* * net/tipc/bcast.h: Include file for TIPC broadcast code * * Copyright (c) 2003-2006, 2014-2015, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_BCAST_H #define _TIPC_BCAST_H #include "core.h" struct tipc_node; struct tipc_msg; struct tipc_nl_msg; struct tipc_nlist; struct tipc_nitem; extern const char tipc_bclink_name[]; extern unsigned long sysctl_tipc_bc_retruni; #define TIPC_METHOD_EXPIRE msecs_to_jiffies(5000) #define BCLINK_MODE_BCAST 0x1 #define BCLINK_MODE_RCAST 0x2 #define BCLINK_MODE_SEL 0x4 struct tipc_nlist { struct list_head list; u32 self; u16 remote; bool local; }; void tipc_nlist_init(struct tipc_nlist *nl, u32 self); void tipc_nlist_purge(struct tipc_nlist *nl); void tipc_nlist_add(struct tipc_nlist *nl, u32 node); void tipc_nlist_del(struct tipc_nlist *nl, u32 node); /* Cookie to be used between socket and broadcast layer * @rcast: replicast (instead of broadcast) was used at previous xmit * @mandatory: broadcast/replicast indication was set by user * @deferredq: defer queue to make message in order * @expires: re-evaluate non-mandatory transmit method if we are past this */ struct tipc_mc_method { bool rcast; bool mandatory; struct sk_buff_head deferredq; unsigned long expires; }; int tipc_bcast_init(struct net *net); void tipc_bcast_stop(struct net *net); void tipc_bcast_add_peer(struct net *net, struct tipc_link *l, struct sk_buff_head *xmitq); void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl); void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id); void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id); int tipc_bcast_get_mtu(struct net *net); void tipc_bcast_toggle_rcast(struct net *net, bool supp); int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_mc_method *method, struct tipc_nlist *dests, u16 *cong_link_cnt); int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, u16 *cong_link_cnt); int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb); void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr); int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *retrq); int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg, struct tipc_link *bcl); int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); int tipc_bclink_reset_stats(struct net *net, struct tipc_link *l); u32 tipc_bcast_get_mode(struct net *net); u32 tipc_bcast_get_broadcast_ratio(struct net *net); void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, struct sk_buff_head *inputq); static inline void tipc_bcast_lock(struct net *net) { spin_lock_bh(&tipc_net(net)->bclock); } static inline void tipc_bcast_unlock(struct net *net) { spin_unlock_bh(&tipc_net(net)->bclock); } static inline struct tipc_link *tipc_bc_sndlink(struct net *net) { return tipc_net(net)->bcl; } #endif
40 40 1 9 108 4 100 2 6 62 40 40 40 40 6 8 2 11 890 887 16 879 886 883 888 398 890 63 887 887 770 67 68 8 8 8 8 8 8 8 53 61 60 60 58 57 54 54 53 51 51 50 48 1 47 46 44 5 43 43 1 43 15 8 1 2 2 2 165 128 2 35 3 134 2 27 6 133 23 157 144 12 145 12 142 2 13 151 5 151 4 148 9 154 151 5 147 7 1 8 146 2 2 6 2 140 138 11 130 6 6 9 3 142 1 131 5 5 2 4 135 1 135 23 112 132 2 122 4 8 121 3 9 43 42 27 20 8 1 2 1 1 1 1 2 1 2 1 1 4 103 2 57 2 2 17 78 1 79 34 9 41 3 44 46 2 2 1 46 6 41 7 43 3 46 46 35 103 78 2 34 1 1 13 62 1 61 2 2 26 28 26 1 1 26 1 1 1 28 28 35 79 6 62 72 69 4 67 3 3 66 5 3 16 65 68 5 73 6 66 8 70 3 68 4 71 2 65 8 70 2 69 4 69 6 70 68 4 73 5 5 5 2 2 4 4 8 2 2 18 14 4 2 3 3 2 2 2 67 68 68 1178 1174 1177 566 566 564 1968 1968 1948 1148 1147 32 32 32 535 535 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 // SPDX-License-Identifier: GPL-2.0-only /* * net/core/fib_rules.c Generic Routing Rules * * Authors: Thomas Graf <tgraf@suug.ch> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/inet_dscp.h> #include <net/sock.h> #include <net/fib_rules.h> #include <net/ip_tunnels.h> #include <linux/indirect_call_wrapper.h> #if defined(CONFIG_IPV6) && defined(CONFIG_IPV6_MULTIPLE_TABLES) #ifdef CONFIG_IP_MULTIPLE_TABLES #define INDIRECT_CALL_MT(f, f2, f1, ...) \ INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) #endif #elif defined(CONFIG_IP_MULTIPLE_TABLES) #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__) #endif static const struct fib_kuid_range fib_kuid_range_unset = { KUIDT_INIT(0), KUIDT_INIT(~0), }; bool fib_rule_matchall(const struct fib_rule *rule) { if (READ_ONCE(rule->iifindex) || READ_ONCE(rule->oifindex) || rule->mark || rule->tun_id || rule->flags) return false; if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) return false; if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) return false; if (fib_rule_port_range_set(&rule->sport_range)) return false; if (fib_rule_port_range_set(&rule->dport_range)) return false; return true; } EXPORT_SYMBOL_GPL(fib_rule_matchall); int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table) { struct fib_rule *r; r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (r == NULL) return -ENOMEM; refcount_set(&r->refcnt, 1); r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; /* The lock is not required here, the list in unreachable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); return 0; } EXPORT_SYMBOL(fib_default_rule_add); static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; if (!list_empty(&ops->rules_list)) { pos = ops->rules_list.next; if (pos->next != &ops->rules_list) { rule = list_entry(pos->next, struct fib_rule, list); if (rule->pref) return rule->pref - 1; } } return 0; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); static struct fib_rules_ops *lookup_rules_ops(const struct net *net, int family) { struct fib_rules_ops *ops; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (ops->family == family) { if (!try_module_get(ops->owner)) ops = NULL; rcu_read_unlock(); return ops; } } rcu_read_unlock(); return NULL; } static void rules_ops_put(struct fib_rules_ops *ops) { if (ops) module_put(ops->owner); } static void flush_route_cache(struct fib_rules_ops *ops) { if (ops->flush_cache) ops->flush_cache(ops); } static int __fib_rules_register(struct fib_rules_ops *ops) { int err = -EEXIST; struct fib_rules_ops *o; struct net *net; net = ops->fro_net; if (ops->rule_size < sizeof(struct fib_rule)) return -EINVAL; if (ops->match == NULL || ops->configure == NULL || ops->compare == NULL || ops->fill == NULL || ops->action == NULL) return -EINVAL; spin_lock(&net->rules_mod_lock); list_for_each_entry(o, &net->rules_ops, list) if (ops->family == o->family) goto errout; list_add_tail_rcu(&ops->list, &net->rules_ops); err = 0; errout: spin_unlock(&net->rules_mod_lock); return err; } struct fib_rules_ops * fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) { struct fib_rules_ops *ops; int err; ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&ops->rules_list); ops->fro_net = net; err = __fib_rules_register(ops); if (err) { kfree(ops); ops = ERR_PTR(err); } return ops; } EXPORT_SYMBOL_GPL(fib_rules_register); static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) { struct fib_rule *rule, *tmp; list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { list_del_rcu(&rule->list); if (ops->delete) ops->delete(rule); fib_rule_put(rule); } } void fib_rules_unregister(struct fib_rules_ops *ops) { struct net *net = ops->fro_net; spin_lock(&net->rules_mod_lock); list_del_rcu(&ops->list); spin_unlock(&net->rules_mod_lock); fib_rules_cleanup_ops(ops); kfree_rcu(ops, rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); static int uid_range_set(struct fib_kuid_range *range) { return uid_valid(range->start) && uid_valid(range->end); } static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) { struct fib_rule_uid_range *in; struct fib_kuid_range out; in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); out.start = make_kuid(current_user_ns(), in->start); out.end = make_kuid(current_user_ns(), in->end); return out; } static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) { struct fib_rule_uid_range out = { from_kuid_munged(current_user_ns(), range->start), from_kuid_munged(current_user_ns(), range->end) }; return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); } static int nla_get_port_range(struct nlattr *pattr, struct fib_rule_port_range *port_range) { const struct fib_rule_port_range *pr = nla_data(pattr); if (!fib_rule_port_range_valid(pr)) return -EINVAL; port_range->start = pr->start; port_range->end = pr->end; return 0; } static int nla_put_port_range(struct sk_buff *skb, int attrtype, struct fib_rule_port_range *range) { return nla_put(skb, attrtype, sizeof(*range), range); } static bool fib_rule_iif_match(const struct fib_rule *rule, int iifindex, const struct flowi *fl) { u8 iif_is_l3_master = READ_ONCE(rule->iif_is_l3_master); return iif_is_l3_master ? l3mdev_fib_rule_iif_match(fl, iifindex) : fl->flowi_iif == iifindex; } static bool fib_rule_oif_match(const struct fib_rule *rule, int oifindex, const struct flowi *fl) { u8 oif_is_l3_master = READ_ONCE(rule->oif_is_l3_master); return oif_is_l3_master ? l3mdev_fib_rule_oif_match(fl, oifindex) : fl->flowi_oif == oifindex; } static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { int iifindex, oifindex, ret = 0; iifindex = READ_ONCE(rule->iifindex); if (iifindex && !fib_rule_iif_match(rule, iifindex, fl)) goto out; oifindex = READ_ONCE(rule->oifindex); if (oifindex && !fib_rule_oif_match(rule, oifindex, fl)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) goto out; if (uid_lt(fl->flowi_uid, rule->uid_range.start) || uid_gt(fl->flowi_uid, rule->uid_range.end)) goto out; ret = INDIRECT_CALL_MT(ops->match, fib6_rule_match, fib4_rule_match, rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; } int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { struct fib_rule *rule; int err; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { jumped: if (!fib_rule_match(rule, ops, fl, flags, arg)) continue; if (rule->action == FR_ACT_GOTO) { struct fib_rule *target; target = rcu_dereference(rule->ctarget); if (target == NULL) { continue; } else { rule = target; goto jumped; } } else if (rule->action == FR_ACT_NOP) continue; else err = INDIRECT_CALL_MT(ops->action, fib6_rule_action, fib4_rule_action, rule, fl, flags, arg); if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, fib6_rule_suppress, fib4_rule_suppress, rule, flags, arg)) continue; if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || likely(refcount_inc_not_zero(&rule->refcnt))) { arg->rule = rule; goto out; } break; } } err = -ESRCH; out: rcu_read_unlock(); return err; } EXPORT_SYMBOL_GPL(fib_rules_lookup); static int call_fib_rule_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_rule *rule, int family, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = family, .info.extack = extack, .rule = rule, }; return call_fib_notifier(nb, event_type, &info.info); } static int call_fib_rule_notifiers(struct net *net, enum fib_event_type event_type, struct fib_rule *rule, struct fib_rules_ops *ops, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = ops->family, .info.extack = extack, .rule = rule, }; ASSERT_RTNL_NET(net); /* Paired with READ_ONCE() in fib_rules_seq() */ WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1); return call_fib_notifiers(net, event_type, &info.info); } /* Called with rcu_read_lock() */ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack) { struct fib_rules_ops *ops; struct fib_rule *rule; int err = 0; ops = lookup_rules_ops(net, family); if (!ops) return -EAFNOSUPPORT; list_for_each_entry_rcu(rule, &ops->rules_list, list) { err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, rule, family, extack); if (err) break; } rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_rules_dump); unsigned int fib_rules_seq_read(const struct net *net, int family) { unsigned int fib_rules_seq; struct fib_rules_ops *ops; ops = lookup_rules_ops(net, family); if (!ops) return 0; /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */ fib_rules_seq = READ_ONCE(ops->fib_rules_seq); rules_ops_put(ops); return fib_rules_seq; } EXPORT_SYMBOL_GPL(fib_rules_seq_read); static struct fib_rule *rule_find(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule, bool user_priority) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (rule->action && r->action != rule->action) continue; if (rule->table && r->table != rule->table) continue; if (user_priority && r->pref != rule->pref) continue; if (rule->iifname[0] && memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (rule->oifname[0] && memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (rule->mark && r->mark != rule->mark) continue; if (rule->suppress_ifgroup != -1 && r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (rule->suppress_prefixlen != -1 && r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (rule->mark_mask && r->mark_mask != rule->mark_mask) continue; if (rule->tun_id && r->tun_id != rule->tun_id) continue; if (rule->l3mdev && r->l3mdev != rule->l3mdev) continue; if (uid_range_set(&rule->uid_range) && (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end))) continue; if (rule->ip_proto && r->ip_proto != rule->ip_proto) continue; if (rule->proto && r->proto != rule->proto) continue; if (fib_rule_port_range_set(&rule->sport_range) && !fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (rule->sport_mask && r->sport_mask != rule->sport_mask) continue; if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (rule->dport_mask && r->dport_mask != rule->dport_mask) continue; if (!ops->compare(r, frh, tb)) continue; return r; } return NULL; } #ifdef CONFIG_NET_L3_MASTER_DEV static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { nlrule->l3mdev = nla_get_u8(nla); if (nlrule->l3mdev != 1) { NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute"); return -1; } return 0; } #else static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel"); return -1; } #endif static int fib_nl2rule_port_mask(const struct nlattr *mask_attr, const struct fib_rule_port_range *range, u16 *port_mask, struct netlink_ext_ack *extack) { if (!fib_rule_port_range_valid(range)) { NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Cannot specify port mask without port value"); return -EINVAL; } if (fib_rule_port_is_range(range)) { NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Cannot specify port mask for port range"); return -EINVAL; } if (range->start & ~nla_get_u16(mask_attr)) { NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Invalid port mask"); return -EINVAL; } *port_mask = nla_get_u16(mask_attr); return 0; } static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct fib_rules_ops *ops, struct nlattr *tb[], struct fib_rule **rule, bool *user_priority) { struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rule *nlrule = NULL; int err = -EINVAL; if (frh->src_len) if (!tb[FRA_SRC] || frh->src_len > (ops->addr_size * 8) || nla_len(tb[FRA_SRC]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid source address"); goto errout; } if (frh->dst_len) if (!tb[FRA_DST] || frh->dst_len > (ops->addr_size * 8) || nla_len(tb[FRA_DST]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid dst address"); goto errout; } nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (!nlrule) { err = -ENOMEM; goto errout; } refcount_set(&nlrule->refcnt, 1); nlrule->fr_net = net; if (tb[FRA_PRIORITY]) { nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); *user_priority = true; } nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC); if (tb[FRA_IIFNAME]) { nlrule->iifindex = -1; nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); } if (tb[FRA_OIFNAME]) { nlrule->oifindex = -1; nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); } if (tb[FRA_FWMARK]) { nlrule->mark = nla_get_u32(tb[FRA_FWMARK]); if (nlrule->mark) /* compatibility: if the mark value is non-zero all bits * are compared unless a mask is explicitly specified. */ nlrule->mark_mask = 0xFFFFFFFF; } if (tb[FRA_FWMASK]) nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); if (tb[FRA_TUN_ID]) nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); if (tb[FRA_L3MDEV] && fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) goto errout_free; nlrule->action = frh->action; nlrule->flags = frh->flags; nlrule->table = frh_get_table(frh, tb); if (tb[FRA_SUPPRESS_PREFIXLEN]) nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); else nlrule->suppress_prefixlen = -1; if (tb[FRA_SUPPRESS_IFGROUP]) nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); else nlrule->suppress_ifgroup = -1; if (tb[FRA_GOTO]) { if (nlrule->action != FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Unexpected goto"); goto errout_free; } nlrule->target = nla_get_u32(tb[FRA_GOTO]); } else if (nlrule->action == FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; } if (nlrule->l3mdev && nlrule->table) { NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive"); goto errout_free; } if (tb[FRA_UID_RANGE]) { if (current_user_ns() != net->user_ns) { err = -EPERM; NL_SET_ERR_MSG(extack, "No permission to set uid"); goto errout_free; } nlrule->uid_range = nla_get_kuid_range(tb); if (!uid_range_set(&nlrule->uid_range) || !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { NL_SET_ERR_MSG(extack, "Invalid uid range"); goto errout_free; } } else { nlrule->uid_range = fib_kuid_range_unset; } if (tb[FRA_IP_PROTO]) nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); if (tb[FRA_SPORT_RANGE]) { err = nla_get_port_range(tb[FRA_SPORT_RANGE], &nlrule->sport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; } if (!fib_rule_port_is_range(&nlrule->sport_range)) nlrule->sport_mask = U16_MAX; } if (tb[FRA_SPORT_MASK]) { err = fib_nl2rule_port_mask(tb[FRA_SPORT_MASK], &nlrule->sport_range, &nlrule->sport_mask, extack); if (err) goto errout_free; } if (tb[FRA_DPORT_RANGE]) { err = nla_get_port_range(tb[FRA_DPORT_RANGE], &nlrule->dport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; } if (!fib_rule_port_is_range(&nlrule->dport_range)) nlrule->dport_mask = U16_MAX; } if (tb[FRA_DPORT_MASK]) { err = fib_nl2rule_port_mask(tb[FRA_DPORT_MASK], &nlrule->dport_range, &nlrule->dport_mask, extack); if (err) goto errout_free; } *rule = nlrule; return 0; errout_free: kfree(nlrule); errout: return err; } static int fib_nl2rule_rtnl(struct fib_rule *nlrule, struct fib_rules_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack) { if (!tb[FRA_PRIORITY]) nlrule->pref = fib_default_rule_pref(ops); /* Backward jumps are prohibited to avoid endless loops */ if (tb[FRA_GOTO] && nlrule->target <= nlrule->pref) { NL_SET_ERR_MSG(extack, "Backward goto not supported"); return -EINVAL; } if (tb[FRA_IIFNAME]) { struct net_device *dev; dev = __dev_get_by_name(nlrule->fr_net, nlrule->iifname); if (dev) { nlrule->iifindex = dev->ifindex; nlrule->iif_is_l3_master = netif_is_l3_master(dev); } } if (tb[FRA_OIFNAME]) { struct net_device *dev; dev = __dev_get_by_name(nlrule->fr_net, nlrule->oifname); if (dev) { nlrule->oifindex = dev->ifindex; nlrule->oif_is_l3_master = netif_is_l3_master(dev); } } return 0; } static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (r->action != rule->action) continue; if (r->table != rule->table) continue; if (r->pref != rule->pref) continue; if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (r->mark != rule->mark) continue; if (r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (r->mark_mask != rule->mark_mask) continue; if (r->tun_id != rule->tun_id) continue; if (r->l3mdev != rule->l3mdev) continue; if (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end)) continue; if (r->ip_proto != rule->ip_proto) continue; if (r->proto != rule->proto) continue; if (!fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (r->sport_mask != rule->sport_mask) continue; if (!fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (r->dport_mask != rule->dport_mask) continue; if (!ops->compare(r, frh, tb)) continue; return 1; } return 0; } static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 }, [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [FRA_PRIORITY] = { .type = NLA_U32 }, [FRA_FWMARK] = { .type = NLA_U32 }, [FRA_FLOW] = { .type = NLA_U32 }, [FRA_TUN_ID] = { .type = NLA_U64 }, [FRA_FWMASK] = { .type = NLA_U32 }, [FRA_TABLE] = { .type = NLA_U32 }, [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, [FRA_GOTO] = { .type = NLA_U32 }, [FRA_L3MDEV] = { .type = NLA_U8 }, [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, [FRA_PROTOCOL] = { .type = NLA_U8 }, [FRA_IP_PROTO] = { .type = NLA_U8 }, [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2), [FRA_FLOWLABEL] = { .type = NLA_BE32 }, [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 }, [FRA_SPORT_MASK] = { .type = NLA_U16 }, [FRA_DPORT_MASK] = { .type = NLA_U16 }, [FRA_DSCP_MASK] = NLA_POLICY_MASK(NLA_U8, INET_DSCP_MASK >> 2), }; int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, bool rtnl_held) { struct fib_rule *rule = NULL, *r, *last = NULL; int err = -EINVAL, unresolved = 0; struct fib_rules_ops *ops = NULL; struct nlattr *tb[FRA_MAX + 1]; bool user_priority = false; struct fib_rule_hdr *frh; frh = nlmsg_payload(nlh, sizeof(*frh)); if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (!ops) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(net, nlh, extack, ops, tb, &rule, &user_priority); if (err) goto errout; if (!rtnl_held) rtnl_net_lock(net); err = fib_nl2rule_rtnl(rule, ops, tb, extack); if (err) goto errout_free; if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; goto errout_free; } err = ops->configure(rule, skb, frh, tb, extack); if (err < 0) goto errout_free; err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack); if (err < 0) goto errout_free; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref == rule->target) { RCU_INIT_POINTER(rule->ctarget, r); break; } } if (rcu_dereference_protected(rule->ctarget, 1) == NULL) unresolved = 1; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref > rule->pref) break; last = r; } if (last) list_add_rcu(&rule->list, &last->list); else list_add_rcu(&rule->list, &ops->rules_list); if (ops->unresolved_rules) { /* * There are unresolved goto rules in the list, check if * any of them are pointing to this new rule. */ list_for_each_entry(r, &ops->rules_list, list) { if (r->action == FR_ACT_GOTO && r->target == rule->pref && rtnl_dereference(r->ctarget) == NULL) { rcu_assign_pointer(r->ctarget, rule); if (--ops->unresolved_rules == 0) break; } } } if (rule->action == FR_ACT_GOTO) ops->nr_goto_rules++; if (unresolved) ops->unresolved_rules++; if (rule->tun_id) ip_tunnel_need_metadata(); fib_rule_get(rule); if (!rtnl_held) rtnl_net_unlock(net); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); return 0; errout_free: if (!rtnl_held) rtnl_net_unlock(net); kfree(rule); errout: rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_newrule); static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return fib_newrule(sock_net(skb->sk), skb, nlh, extack, false); } int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, bool rtnl_held) { struct fib_rule *rule = NULL, *nlrule = NULL; struct fib_rules_ops *ops = NULL; struct nlattr *tb[FRA_MAX+1]; bool user_priority = false; struct fib_rule_hdr *frh; int err = -EINVAL; frh = nlmsg_payload(nlh, sizeof(*frh)); if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(net, nlh, extack, ops, tb, &nlrule, &user_priority); if (err) goto errout; if (!rtnl_held) rtnl_net_lock(net); err = fib_nl2rule_rtnl(nlrule, ops, tb, extack); if (err) goto errout_free; rule = rule_find(ops, frh, tb, nlrule, user_priority); if (!rule) { err = -ENOENT; goto errout_free; } if (rule->flags & FIB_RULE_PERMANENT) { err = -EPERM; goto errout_free; } if (ops->delete) { err = ops->delete(rule); if (err) goto errout_free; } if (rule->tun_id) ip_tunnel_unneed_metadata(); list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { ops->nr_goto_rules--; if (rtnl_dereference(rule->ctarget) == NULL) ops->unresolved_rules--; } /* * Check if this rule is a target to any of them. If so, * adjust to the next one with the same preference or * disable them. As this operation is eventually very * expensive, it is only performed if goto rules, except * current if it is goto rule, have actually been added. */ if (ops->nr_goto_rules > 0) { struct fib_rule *n, *r; n = list_next_entry(rule, list); if (&n->list == &ops->rules_list || n->pref != rule->pref) n = NULL; list_for_each_entry(r, &ops->rules_list, list) { if (rtnl_dereference(r->ctarget) != rule) continue; rcu_assign_pointer(r->ctarget, n); if (!n) ops->unresolved_rules++; } } call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL); if (!rtnl_held) rtnl_net_unlock(net); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); kfree(nlrule); return 0; errout_free: if (!rtnl_held) rtnl_net_unlock(net); kfree(nlrule); errout: rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_delrule); static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return fib_delrule(sock_net(skb->sk), skb, nlh, extack, false); } static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) { size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ + nla_total_size(sizeof(struct fib_kuid_range)) + nla_total_size(1) /* FRA_PROTOCOL */ + nla_total_size(1) /* FRA_IP_PROTO */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_DPORT_RANGE */ + nla_total_size(2) /* FRA_SPORT_MASK */ + nla_total_size(2); /* FRA_DPORT_MASK */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); return payload; } static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, u32 pid, u32 seq, int type, int flags, struct fib_rules_ops *ops) { struct nlmsghdr *nlh; struct fib_rule_hdr *frh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags); if (nlh == NULL) return -EMSGSIZE; frh = nlmsg_data(nlh); frh->family = ops->family; frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto)) goto nla_put_failure; if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; if (READ_ONCE(rule->iifindex) == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; if (READ_ONCE(rule->oifindex) == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } if ((rule->pref && nla_put_u32(skb, FRA_PRIORITY, rule->pref)) || (rule->mark && nla_put_u32(skb, FRA_FWMARK, rule->mark)) || ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || (rule->l3mdev && nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || (uid_range_set(&rule->uid_range) && nla_put_uid_range(skb, &rule->uid_range)) || (fib_rule_port_range_set(&rule->sport_range) && nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || (rule->sport_mask && nla_put_u16(skb, FRA_SPORT_MASK, rule->sport_mask)) || (fib_rule_port_range_set(&rule->dport_range) && nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || (rule->dport_mask && nla_put_u16(skb, FRA_DPORT_MASK, rule->dport_mask)) || (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) goto nla_put_failure; } if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, struct fib_rules_ops *ops) { int idx = 0; struct fib_rule *rule; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWRULE, NLM_F_MULTI, ops); if (err) break; skip: idx++; } rcu_read_unlock(); cb->args[1] = idx; rules_ops_put(ops); return err; } static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib_rule_hdr *frh; frh = nlmsg_payload(nlh, sizeof(*frh)); if (!frh) { NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); return -EINVAL; } if (frh->dst_len || frh->src_len || frh->tos || frh->table || frh->res1 || frh->res2 || frh->action || frh->flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for fib rule dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request"); return -EINVAL; } return 0; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; int err, idx = 0, family; if (cb->strict_check) { err = fib_valid_dumprule_req(nlh, cb->extack); if (err < 0) return err; } family = rtnl_msg_family(nlh); if (family != AF_UNSPEC) { /* Protocol specific dump request */ ops = lookup_rules_ops(net, family); if (ops == NULL) return -EAFNOSUPPORT; return dump_rules(skb, cb, ops); } err = 0; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; err = dump_rules(skb, cb, ops); if (err < 0) break; cb->args[1] = 0; skip: idx++; } rcu_read_unlock(); cb->args[0] = idx; return err; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid) { struct net *net; struct sk_buff *skb; int err = -ENOMEM; net = ops->fro_net; skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); if (skb == NULL) goto errout; err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops); if (err < 0) { /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, ops->nlgroup, err); } static void attach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && strcmp(dev->name, rule->iifname) == 0) { WRITE_ONCE(rule->iifindex, dev->ifindex); WRITE_ONCE(rule->iif_is_l3_master, netif_is_l3_master(dev)); } if (rule->oifindex == -1 && strcmp(dev->name, rule->oifname) == 0) { WRITE_ONCE(rule->oifindex, dev->ifindex); WRITE_ONCE(rule->oif_is_l3_master, netif_is_l3_master(dev)); } } } static void detach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == dev->ifindex) { WRITE_ONCE(rule->iifindex, -1); WRITE_ONCE(rule->iif_is_l3_master, false); } if (rule->oifindex == dev->ifindex) { WRITE_ONCE(rule->oifindex, -1); WRITE_ONCE(rule->oif_is_l3_master, false); } } } static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct fib_rules_ops *ops; ASSERT_RTNL(); switch (event) { case NETDEV_REGISTER: list_for_each_entry(ops, &net->rules_ops, list) attach_rules(&ops->rules_list, dev); break; case NETDEV_CHANGENAME: list_for_each_entry(ops, &net->rules_ops, list) { detach_rules(&ops->rules_list, dev); attach_rules(&ops->rules_list, dev); } break; case NETDEV_UNREGISTER: list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); break; } return NOTIFY_DONE; } static struct notifier_block fib_rules_notifier = { .notifier_call = fib_rules_event, }; static int __net_init fib_rules_net_init(struct net *net) { INIT_LIST_HEAD(&net->rules_ops); spin_lock_init(&net->rules_mod_lock); return 0; } static void __net_exit fib_rules_net_exit(struct net *net) { WARN_ON_ONCE(!list_empty(&net->rules_ops)); } static struct pernet_operations fib_rules_net_ops = { .init = fib_rules_net_init, .exit = fib_rules_net_exit, }; static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule, .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule, .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; static int __init fib_rules_init(void) { int err; rtnl_register_many(fib_rules_rtnl_msg_handlers); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) goto fail; err = register_netdevice_notifier(&fib_rules_notifier); if (err < 0) goto fail_unregister; return 0; fail_unregister: unregister_pernet_subsys(&fib_rules_net_ops); fail: rtnl_unregister_many(fib_rules_rtnl_msg_handlers); return err; } subsys_initcall(fib_rules_init);
30 5 20 15 21 21 10 10 15 6 6 14 14 14 3 3 4 2 1 1 5 5 17 13 13 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 /* * Resizable simple ram filesystem for Linux. * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. * * Usage limits added by David Gibson, Linuxcare Australia. * This file is released under the GPL. */ /* * NOTE! This filesystem is probably most useful * not as a real filesystem, but as an example of * how virtual filesystems can be written. * * It doesn't get much simpler than this. Consider * that this file implements the full semantics of * a POSIX-compliant read-write filesystem. * * Note in particular how the filesystem does not * need to implement any data structures of its own * to keep track of the virtual data: using the VFS * caches is sufficient. */ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/ramfs.h> #include <linux/sched.h> #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/seq_file.h> #include "internal.h" struct ramfs_mount_opts { umode_t mode; }; struct ramfs_fs_info { struct ramfs_mount_opts mount_opts; }; #define RAMFS_DEFAULT_MODE 0755 static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; struct inode *ramfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev) { struct inode * inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_mapping->a_ops = &ram_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); simple_inode_init_ts(inode); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); break; case S_IFREG: inode->i_op = &ramfs_file_inode_operations; inode->i_fop = &ramfs_file_operations; break; case S_IFDIR: inode->i_op = &ramfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); break; } } return inode; } /* * File creation. Allocate an inode, and we're done.. */ /* SMP-safe */ static int ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); int error = -ENOSPC; if (inode) { error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, NULL); if (error) { iput(inode); goto out; } d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ error = 0; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } out: return error; } static struct dentry *ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int retval = ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return ERR_PTR(retval); } static int ramfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); } static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct inode *inode; int error = -ENOSPC; inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); if (inode) { int l = strlen(symname)+1; error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, NULL); if (error) { iput(inode); goto out; } error = page_symlink(inode, symname, l); if (!error) { d_instantiate(dentry, inode); dget(dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } else iput(inode); } out: return error; } static int ramfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; int error; inode = ramfs_get_inode(dir->i_sb, dir, mode, 0); if (!inode) return -ENOSPC; error = security_inode_init_security(inode, dir, &file_dentry(file)->d_name, NULL, NULL); if (error) { iput(inode); goto out; } d_tmpfile(file, inode); out: return finish_open_simple(file, error); } static const struct inode_operations ramfs_dir_inode_operations = { .create = ramfs_create, .lookup = simple_lookup, .link = simple_link, .unlink = simple_unlink, .symlink = ramfs_symlink, .mkdir = ramfs_mkdir, .rmdir = simple_rmdir, .mknod = ramfs_mknod, .rename = simple_rename, .tmpfile = ramfs_tmpfile, }; /* * Display the mount options in /proc/mounts. */ static int ramfs_show_options(struct seq_file *m, struct dentry *root) { struct ramfs_fs_info *fsi = root->d_sb->s_fs_info; if (fsi->mount_opts.mode != RAMFS_DEFAULT_MODE) seq_printf(m, ",mode=%o", fsi->mount_opts.mode); return 0; } static const struct super_operations ramfs_ops = { .statfs = simple_statfs, .drop_inode = inode_just_drop, .show_options = ramfs_show_options, }; enum ramfs_param { Opt_mode, }; const struct fs_parameter_spec ramfs_fs_parameters[] = { fsparam_u32oct("mode", Opt_mode), {} }; static int ramfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result result; struct ramfs_fs_info *fsi = fc->s_fs_info; int opt; opt = fs_parse(fc, ramfs_fs_parameters, param, &result); if (opt == -ENOPARAM) { opt = vfs_parse_fs_param_source(fc, param); if (opt != -ENOPARAM) return opt; /* * We might like to report bad mount options here; * but traditionally ramfs has ignored all mount options, * and as it is used as a !CONFIG_SHMEM simple substitute * for tmpfs, better continue to ignore other mount options. */ return 0; } if (opt < 0) return opt; switch (opt) { case Opt_mode: fsi->mount_opts.mode = result.uint_32 & S_IALLUGO; break; } return 0; } static int ramfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct ramfs_fs_info *fsi = sb->s_fs_info; struct inode *inode; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = RAMFS_MAGIC; sb->s_op = &ramfs_ops; sb->s_d_flags = DCACHE_DONTCACHE; sb->s_time_gran = 1; inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); sb->s_root = d_make_root(inode); if (!sb->s_root) return -ENOMEM; return 0; } static int ramfs_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, ramfs_fill_super); } static void ramfs_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations ramfs_context_ops = { .free = ramfs_free_fc, .parse_param = ramfs_parse_param, .get_tree = ramfs_get_tree, }; int ramfs_init_fs_context(struct fs_context *fc) { struct ramfs_fs_info *fsi; fsi = kzalloc(sizeof(*fsi), GFP_KERNEL); if (!fsi) return -ENOMEM; fsi->mount_opts.mode = RAMFS_DEFAULT_MODE; fc->s_fs_info = fsi; fc->ops = &ramfs_context_ops; return 0; } void ramfs_kill_sb(struct super_block *sb) { kfree(sb->s_fs_info); kill_litter_super(sb); } static struct file_system_type ramfs_fs_type = { .name = "ramfs", .init_fs_context = ramfs_init_fs_context, .parameters = ramfs_fs_parameters, .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; static int __init init_ramfs_fs(void) { return register_filesystem(&ramfs_fs_type); } fs_initcall(init_ramfs_fs);
276 273 7 1 1 1 5 6 2 5 6 16 16 5 5 5 5 19 4 16 4 17 299 299 276 276 243 243 243 242 240 240 17 1 16 16 15 16 10 4 16 16 15 11 15 11 16 301 300 301 279 277 243 23 243 240 240 218 218 219 218 219 24 301 276 299 301 276 301 301 299 276 268 299 269 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 // SPDX-License-Identifier: GPL-2.0-or-later /* * Information interface for ALSA driver * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/init.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/module.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/info.h> #include <linux/utsname.h> #include <linux/proc_fs.h> #include <linux/mutex.h> int snd_info_check_reserved_words(const char *str) { static const char * const reserved[] = { "version", "meminfo", "memdebug", "detect", "devices", "oss", "cards", "timers", "synth", "pcm", "seq", NULL }; const char * const *xstr = reserved; while (*xstr) { if (!strcmp(*xstr, str)) return 0; xstr++; } if (!strncmp(str, "card", 4)) return 0; return 1; } static DEFINE_MUTEX(info_mutex); struct snd_info_private_data { struct snd_info_buffer *rbuffer; struct snd_info_buffer *wbuffer; struct snd_info_entry *entry; void *file_private_data; }; static int snd_info_version_init(void); static void snd_info_clear_entries(struct snd_info_entry *entry); /* */ static struct snd_info_entry *snd_proc_root; struct snd_info_entry *snd_seq_root; EXPORT_SYMBOL(snd_seq_root); #ifdef CONFIG_SND_OSSEMUL struct snd_info_entry *snd_oss_root; #endif static int alloc_info_private(struct snd_info_entry *entry, struct snd_info_private_data **ret) { struct snd_info_private_data *data; if (!entry || !entry->p) return -ENODEV; if (!try_module_get(entry->module)) return -EFAULT; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) { module_put(entry->module); return -ENOMEM; } data->entry = entry; *ret = data; return 0; } static bool valid_pos(loff_t pos, size_t count) { if (pos < 0 || (long) pos != pos || (ssize_t) count < 0) return false; if ((unsigned long) pos + (unsigned long) count < (unsigned long) pos) return false; return true; } /* * file ops for binary proc files */ static loff_t snd_info_entry_llseek(struct file *file, loff_t offset, int orig) { struct snd_info_private_data *data; struct snd_info_entry *entry; loff_t size; data = file->private_data; entry = data->entry; guard(mutex)(&entry->access); if (entry->c.ops->llseek) return entry->c.ops->llseek(entry, data->file_private_data, file, offset, orig); size = entry->size; switch (orig) { case SEEK_SET: break; case SEEK_CUR: offset += file->f_pos; break; case SEEK_END: if (!size) return -EINVAL; offset += size; break; default: return -EINVAL; } if (offset < 0) return -EINVAL; if (size && offset > size) offset = size; file->f_pos = offset; return offset; } static ssize_t snd_info_entry_read(struct file *file, char __user *buffer, size_t count, loff_t * offset) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; size_t size; loff_t pos; pos = *offset; if (!valid_pos(pos, count)) return -EIO; if (pos >= entry->size) return 0; size = entry->size - pos; size = min(count, size); size = entry->c.ops->read(entry, data->file_private_data, file, buffer, size, pos); if ((ssize_t) size > 0) *offset = pos + size; return size; } static ssize_t snd_info_entry_write(struct file *file, const char __user *buffer, size_t count, loff_t * offset) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; ssize_t size = 0; loff_t pos; pos = *offset; if (!valid_pos(pos, count)) return -EIO; if (count > 0) { size_t maxsize = entry->size - pos; count = min(count, maxsize); size = entry->c.ops->write(entry, data->file_private_data, file, buffer, count, pos); } if (size > 0) *offset = pos + size; return size; } static __poll_t snd_info_entry_poll(struct file *file, poll_table *wait) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; __poll_t mask = 0; if (entry->c.ops->poll) return entry->c.ops->poll(entry, data->file_private_data, file, wait); if (entry->c.ops->read) mask |= EPOLLIN | EPOLLRDNORM; if (entry->c.ops->write) mask |= EPOLLOUT | EPOLLWRNORM; return mask; } static long snd_info_entry_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; if (!entry->c.ops->ioctl) return -ENOTTY; return entry->c.ops->ioctl(entry, data->file_private_data, file, cmd, arg); } static int snd_info_entry_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); struct snd_info_private_data *data; struct snd_info_entry *entry; data = file->private_data; if (data == NULL) return 0; entry = data->entry; if (!entry->c.ops->mmap) return -ENXIO; return entry->c.ops->mmap(entry, data->file_private_data, inode, file, vma); } static int snd_info_entry_open(struct inode *inode, struct file *file) { struct snd_info_entry *entry = pde_data(inode); struct snd_info_private_data *data; int mode, err; guard(mutex)(&info_mutex); err = alloc_info_private(entry, &data); if (err < 0) return err; mode = file->f_flags & O_ACCMODE; if (((mode == O_RDONLY || mode == O_RDWR) && !entry->c.ops->read) || ((mode == O_WRONLY || mode == O_RDWR) && !entry->c.ops->write)) { err = -ENODEV; goto error; } if (entry->c.ops->open) { err = entry->c.ops->open(entry, mode, &data->file_private_data); if (err < 0) goto error; } file->private_data = data; return 0; error: kfree(data); module_put(entry->module); return err; } static int snd_info_entry_release(struct inode *inode, struct file *file) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; if (entry->c.ops->release) entry->c.ops->release(entry, file->f_flags & O_ACCMODE, data->file_private_data); module_put(entry->module); kfree(data); return 0; } static const struct proc_ops snd_info_entry_operations = { .proc_lseek = snd_info_entry_llseek, .proc_read = snd_info_entry_read, .proc_write = snd_info_entry_write, .proc_poll = snd_info_entry_poll, .proc_ioctl = snd_info_entry_ioctl, .proc_mmap = snd_info_entry_mmap, .proc_open = snd_info_entry_open, .proc_release = snd_info_entry_release, }; /* * file ops for text proc files */ static ssize_t snd_info_text_entry_write(struct file *file, const char __user *buffer, size_t count, loff_t *offset) { struct seq_file *m = file->private_data; struct snd_info_private_data *data = m->private; struct snd_info_entry *entry = data->entry; struct snd_info_buffer *buf; loff_t pos; size_t next; if (!entry->c.text.write) return -EIO; pos = *offset; if (!valid_pos(pos, count)) return -EIO; next = pos + count; /* don't handle too large text inputs */ if (next > 16 * 1024) return -EIO; guard(mutex)(&entry->access); buf = data->wbuffer; if (!buf) { data->wbuffer = buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) return -ENOMEM; } if (next > buf->len) { char *nbuf = kvzalloc(PAGE_ALIGN(next), GFP_KERNEL); if (!nbuf) return -ENOMEM; kvfree(buf->buffer); buf->buffer = nbuf; buf->len = PAGE_ALIGN(next); } if (copy_from_user(buf->buffer + pos, buffer, count)) return -EFAULT; buf->size = next; *offset = next; return count; } static int snd_info_seq_show(struct seq_file *seq, void *p) { struct snd_info_private_data *data = seq->private; struct snd_info_entry *entry = data->entry; if (!entry->c.text.read) { return -EIO; } else { data->rbuffer->buffer = (char *)seq; /* XXX hack! */ entry->c.text.read(entry, data->rbuffer); } return 0; } static int snd_info_text_entry_open(struct inode *inode, struct file *file) { struct snd_info_entry *entry = pde_data(inode); struct snd_info_private_data *data; int err; guard(mutex)(&info_mutex); err = alloc_info_private(entry, &data); if (err < 0) return err; data->rbuffer = kzalloc(sizeof(*data->rbuffer), GFP_KERNEL); if (!data->rbuffer) { err = -ENOMEM; goto error; } if (entry->size) err = single_open_size(file, snd_info_seq_show, data, entry->size); else err = single_open(file, snd_info_seq_show, data); if (err < 0) goto error; return 0; error: kfree(data->rbuffer); kfree(data); module_put(entry->module); return err; } static int snd_info_text_entry_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; struct snd_info_private_data *data = m->private; struct snd_info_entry *entry = data->entry; if (data->wbuffer && entry->c.text.write) entry->c.text.write(entry, data->wbuffer); single_release(inode, file); kfree(data->rbuffer); if (data->wbuffer) { kvfree(data->wbuffer->buffer); kfree(data->wbuffer); } module_put(entry->module); kfree(data); return 0; } static const struct proc_ops snd_info_text_entry_ops = { .proc_open = snd_info_text_entry_open, .proc_release = snd_info_text_entry_release, .proc_write = snd_info_text_entry_write, .proc_lseek = seq_lseek, .proc_read = seq_read, }; static struct snd_info_entry *create_subdir(struct module *mod, const char *name) { struct snd_info_entry *entry; entry = snd_info_create_module_entry(mod, name, NULL); if (!entry) return NULL; entry->mode = S_IFDIR | 0555; if (snd_info_register(entry) < 0) { snd_info_free_entry(entry); return NULL; } return entry; } static struct snd_info_entry * snd_info_create_entry(const char *name, struct snd_info_entry *parent, struct module *module); int __init snd_info_init(void) { snd_proc_root = snd_info_create_entry("asound", NULL, THIS_MODULE); if (!snd_proc_root) return -ENOMEM; snd_proc_root->mode = S_IFDIR | 0555; snd_proc_root->p = proc_mkdir("asound", NULL); if (!snd_proc_root->p) goto error; #ifdef CONFIG_SND_OSSEMUL snd_oss_root = create_subdir(THIS_MODULE, "oss"); if (!snd_oss_root) goto error; #endif #if IS_ENABLED(CONFIG_SND_SEQUENCER) snd_seq_root = create_subdir(THIS_MODULE, "seq"); if (!snd_seq_root) goto error; #endif if (snd_info_version_init() < 0 || snd_minor_info_init() < 0 || snd_minor_info_oss_init() < 0 || snd_card_info_init() < 0 || snd_info_minor_register() < 0) goto error; return 0; error: snd_info_free_entry(snd_proc_root); return -ENOMEM; } int __exit snd_info_done(void) { snd_info_free_entry(snd_proc_root); return 0; } static void snd_card_id_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_card *card = entry->private_data; snd_iprintf(buffer, "%s\n", card->id); } /* * create a card proc file * called from init.c */ int snd_info_card_create(struct snd_card *card) { char str[8]; struct snd_info_entry *entry; if (snd_BUG_ON(!card)) return -ENXIO; sprintf(str, "card%i", card->number); entry = create_subdir(card->module, str); if (!entry) return -ENOMEM; card->proc_root = entry; return snd_card_ro_proc_new(card, "id", card, snd_card_id_read); } /* * register the card proc file * called from init.c * can be called multiple times for reinitialization */ int snd_info_card_register(struct snd_card *card) { struct proc_dir_entry *p; int err; if (snd_BUG_ON(!card)) return -ENXIO; err = snd_info_register(card->proc_root); if (err < 0) return err; if (!strcmp(card->id, card->proc_root->name)) return 0; if (card->proc_root_link) return 0; p = proc_symlink(card->id, snd_proc_root->p, card->proc_root->name); if (!p) return -ENOMEM; card->proc_root_link = p; return 0; } /* * called on card->id change */ void snd_info_card_id_change(struct snd_card *card) { guard(mutex)(&info_mutex); if (card->proc_root_link) { proc_remove(card->proc_root_link); card->proc_root_link = NULL; } if (strcmp(card->id, card->proc_root->name)) card->proc_root_link = proc_symlink(card->id, snd_proc_root->p, card->proc_root->name); } /* * de-register the card proc file * called from init.c */ void snd_info_card_disconnect(struct snd_card *card) { if (!card) return; proc_remove(card->proc_root_link); if (card->proc_root) proc_remove(card->proc_root->p); guard(mutex)(&info_mutex); if (card->proc_root) snd_info_clear_entries(card->proc_root); card->proc_root_link = NULL; card->proc_root = NULL; } /* * release the card proc file resources * called from init.c */ int snd_info_card_free(struct snd_card *card) { if (!card) return 0; snd_info_free_entry(card->proc_root); card->proc_root = NULL; return 0; } /** * snd_info_get_line - read one line from the procfs buffer * @buffer: the procfs buffer * @line: the buffer to store * @len: the max. buffer size * * Reads one line from the buffer and stores the string. * * Return: Zero if successful, or 1 if error or EOF. */ int snd_info_get_line(struct snd_info_buffer *buffer, char *line, int len) { int c; if (snd_BUG_ON(!buffer)) return 1; if (!buffer->buffer) return 1; if (len <= 0 || buffer->stop || buffer->error) return 1; while (!buffer->stop) { c = buffer->buffer[buffer->curr++]; if (buffer->curr >= buffer->size) buffer->stop = 1; if (c == '\n') break; if (len > 1) { len--; *line++ = c; } } *line = '\0'; return 0; } EXPORT_SYMBOL(snd_info_get_line); /** * snd_info_get_str - parse a string token * @dest: the buffer to store the string token * @src: the original string * @len: the max. length of token - 1 * * Parses the original string and copy a token to the given * string buffer. * * Return: The updated pointer of the original string so that * it can be used for the next call. */ const char *snd_info_get_str(char *dest, const char *src, int len) { int c; while (*src == ' ' || *src == '\t') src++; if (*src == '"' || *src == '\'') { c = *src++; while (--len > 0 && *src && *src != c) { *dest++ = *src++; } if (*src == c) src++; } else { while (--len > 0 && *src && *src != ' ' && *src != '\t') { *dest++ = *src++; } } *dest = 0; while (*src == ' ' || *src == '\t') src++; return src; } EXPORT_SYMBOL(snd_info_get_str); /* * snd_info_create_entry - create an info entry * @name: the proc file name * @parent: the parent directory * * Creates an info entry with the given file name and initializes as * the default state. * * Usually called from other functions such as * snd_info_create_card_entry(). * * Return: The pointer of the new instance, or %NULL on failure. */ static struct snd_info_entry * snd_info_create_entry(const char *name, struct snd_info_entry *parent, struct module *module) { struct snd_info_entry *entry; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry == NULL) return NULL; entry->name = kstrdup(name, GFP_KERNEL); if (entry->name == NULL) { kfree(entry); return NULL; } entry->mode = S_IFREG | 0444; entry->content = SNDRV_INFO_CONTENT_TEXT; mutex_init(&entry->access); INIT_LIST_HEAD(&entry->children); INIT_LIST_HEAD(&entry->list); entry->parent = parent; entry->module = module; if (parent) { guard(mutex)(&parent->access); list_add_tail(&entry->list, &parent->children); } return entry; } /** * snd_info_create_module_entry - create an info entry for the given module * @module: the module pointer * @name: the file name * @parent: the parent directory * * Creates a new info entry and assigns it to the given module. * * Return: The pointer of the new instance, or %NULL on failure. */ struct snd_info_entry *snd_info_create_module_entry(struct module * module, const char *name, struct snd_info_entry *parent) { if (!parent) parent = snd_proc_root; return snd_info_create_entry(name, parent, module); } EXPORT_SYMBOL(snd_info_create_module_entry); /** * snd_info_create_card_entry - create an info entry for the given card * @card: the card instance * @name: the file name * @parent: the parent directory * * Creates a new info entry and assigns it to the given card. * * Return: The pointer of the new instance, or %NULL on failure. */ struct snd_info_entry *snd_info_create_card_entry(struct snd_card *card, const char *name, struct snd_info_entry * parent) { if (!parent) parent = card->proc_root; return snd_info_create_entry(name, parent, card->module); } EXPORT_SYMBOL(snd_info_create_card_entry); static void snd_info_clear_entries(struct snd_info_entry *entry) { struct snd_info_entry *p; if (!entry->p) return; list_for_each_entry(p, &entry->children, list) snd_info_clear_entries(p); entry->p = NULL; } /** * snd_info_free_entry - release the info entry * @entry: the info entry * * Releases the info entry. */ void snd_info_free_entry(struct snd_info_entry * entry) { struct snd_info_entry *p, *n; if (!entry) return; if (entry->p) { proc_remove(entry->p); guard(mutex)(&info_mutex); snd_info_clear_entries(entry); } /* free all children at first */ list_for_each_entry_safe(p, n, &entry->children, list) snd_info_free_entry(p); p = entry->parent; if (p) { guard(mutex)(&p->access); list_del(&entry->list); } kfree(entry->name); if (entry->private_free) entry->private_free(entry); kfree(entry); } EXPORT_SYMBOL(snd_info_free_entry); static int __snd_info_register(struct snd_info_entry *entry) { struct proc_dir_entry *root, *p = NULL; if (snd_BUG_ON(!entry)) return -ENXIO; root = entry->parent == NULL ? snd_proc_root->p : entry->parent->p; guard(mutex)(&info_mutex); if (entry->p || !root) return 0; if (S_ISDIR(entry->mode)) { p = proc_mkdir_mode(entry->name, entry->mode, root); if (!p) return -ENOMEM; } else { const struct proc_ops *ops; if (entry->content == SNDRV_INFO_CONTENT_DATA) ops = &snd_info_entry_operations; else ops = &snd_info_text_entry_ops; p = proc_create_data(entry->name, entry->mode, root, ops, entry); if (!p) return -ENOMEM; proc_set_size(p, entry->size); } entry->p = p; return 0; } /** * snd_info_register - register the info entry * @entry: the info entry * * Registers the proc info entry. * The all children entries are registered recursively. * * Return: Zero if successful, or a negative error code on failure. */ int snd_info_register(struct snd_info_entry *entry) { struct snd_info_entry *p; int err; if (!entry->p) { err = __snd_info_register(entry); if (err < 0) return err; } list_for_each_entry(p, &entry->children, list) { err = snd_info_register(p); if (err < 0) return err; } return 0; } EXPORT_SYMBOL(snd_info_register); /** * snd_card_rw_proc_new - Create a read/write text proc file entry for the card * @card: the card instance * @name: the file name * @private_data: the arbitrary private data * @read: the read callback * @write: the write callback, NULL for read-only * * This proc file entry will be registered via snd_card_register() call, and * it will be removed automatically at the card removal, too. * * Return: zero if successful, or a negative error code */ int snd_card_rw_proc_new(struct snd_card *card, const char *name, void *private_data, void (*read)(struct snd_info_entry *, struct snd_info_buffer *), void (*write)(struct snd_info_entry *entry, struct snd_info_buffer *buffer)) { struct snd_info_entry *entry; entry = snd_info_create_card_entry(card, name, card->proc_root); if (!entry) return -ENOMEM; snd_info_set_text_ops(entry, private_data, read); if (write) { entry->mode |= 0200; entry->c.text.write = write; } return 0; } EXPORT_SYMBOL_GPL(snd_card_rw_proc_new); /* */ static void snd_info_version_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { snd_iprintf(buffer, "Advanced Linux Sound Architecture Driver Version k%s.\n", init_utsname()->release); } static int __init snd_info_version_init(void) { struct snd_info_entry *entry; entry = snd_info_create_module_entry(THIS_MODULE, "version", NULL); if (entry == NULL) return -ENOMEM; entry->c.text.read = snd_info_version_read; return snd_info_register(entry); /* freed in error path */ }
1 1 1 7 1 1 1 1 1 14 7 7 14 14 14 7 7 14 14 4 2 7 4 4 2 5 14 1 3 7 7 7 6 1 7 7 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 // SPDX-License-Identifier: GPL-2.0+ /* * usblp.c * * Copyright (c) 1999 Michael Gee <michael@linuxspecific.com> * Copyright (c) 1999 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2000 Randy Dunlap <rdunlap@xenotime.net> * Copyright (c) 2000 Vojtech Pavlik <vojtech@suse.cz> # Copyright (c) 2001 Pete Zaitcev <zaitcev@redhat.com> # Copyright (c) 2001 David Paschal <paschal@rcsis.com> * Copyright (c) 2006 Oliver Neukum <oliver@neukum.name> * * USB Printer Device Class driver for USB printers and printer cables * * Sponsored by SuSE * * ChangeLog: * v0.1 - thorough cleaning, URBification, almost a rewrite * v0.2 - some more cleanups * v0.3 - cleaner again, waitqueue fixes * v0.4 - fixes in unidirectional mode * v0.5 - add DEVICE_ID string support * v0.6 - never time out * v0.7 - fixed bulk-IN read and poll (David Paschal) * v0.8 - add devfs support * v0.9 - fix unplug-while-open paths * v0.10- remove sleep_on, fix error on oom (oliver@neukum.org) * v0.11 - add proto_bias option (Pete Zaitcev) * v0.12 - add hpoj.sourceforge.net ioctls (David Paschal) * v0.13 - alloc space for statusbuf (<status> not on stack); * use usb_alloc_coherent() for read buf & write buf; * none - Maintained in Linux kernel after v0.13 */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/minmax.h> #include <linux/sched/signal.h> #include <linux/signal.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/lp.h> #include <linux/mutex.h> #undef DEBUG #include <linux/usb.h> #include <linux/usb/ch9.h> #include <linux/ratelimit.h> /* * Version Information */ #define DRIVER_AUTHOR "Michael Gee, Pavel Machek, Vojtech Pavlik, Randy Dunlap, Pete Zaitcev, David Paschal" #define DRIVER_DESC "USB Printer Device Class driver" #define USBLP_BUF_SIZE 8192 #define USBLP_BUF_SIZE_IN 1024 #define USBLP_DEVICE_ID_SIZE 1024 /* ioctls: */ #define IOCNR_GET_DEVICE_ID 1 #define IOCNR_GET_PROTOCOLS 2 #define IOCNR_SET_PROTOCOL 3 #define IOCNR_HP_SET_CHANNEL 4 #define IOCNR_GET_BUS_ADDRESS 5 #define IOCNR_GET_VID_PID 6 #define IOCNR_SOFT_RESET 7 /* Get device_id string: */ #define LPIOC_GET_DEVICE_ID(len) _IOC(_IOC_READ, 'P', IOCNR_GET_DEVICE_ID, len) /* The following ioctls were added for http://hpoj.sourceforge.net: * Get two-int array: * [0]=current protocol * (1=USB_CLASS_PRINTER/1/1, 2=USB_CLASS_PRINTER/1/2, * 3=USB_CLASS_PRINTER/1/3), * [1]=supported protocol mask (mask&(1<<n)!=0 means * USB_CLASS_PRINTER/1/n supported): */ #define LPIOC_GET_PROTOCOLS(len) _IOC(_IOC_READ, 'P', IOCNR_GET_PROTOCOLS, len) /* * Set protocol * (arg: 1=USB_CLASS_PRINTER/1/1, 2=USB_CLASS_PRINTER/1/2, * 3=USB_CLASS_PRINTER/1/3): */ #define LPIOC_SET_PROTOCOL _IOC(_IOC_WRITE, 'P', IOCNR_SET_PROTOCOL, 0) /* Set channel number (HP Vendor-specific command): */ #define LPIOC_HP_SET_CHANNEL _IOC(_IOC_WRITE, 'P', IOCNR_HP_SET_CHANNEL, 0) /* Get two-int array: [0]=bus number, [1]=device address: */ #define LPIOC_GET_BUS_ADDRESS(len) _IOC(_IOC_READ, 'P', IOCNR_GET_BUS_ADDRESS, len) /* Get two-int array: [0]=vendor ID, [1]=product ID: */ #define LPIOC_GET_VID_PID(len) _IOC(_IOC_READ, 'P', IOCNR_GET_VID_PID, len) /* Perform class specific soft reset */ #define LPIOC_SOFT_RESET _IOC(_IOC_NONE, 'P', IOCNR_SOFT_RESET, 0) /* * A DEVICE_ID string may include the printer's serial number. * It should end with a semi-colon (';'). * An example from an HP 970C DeskJet printer is (this is one long string, * with the serial number changed): MFG:HEWLETT-PACKARD;MDL:DESKJET 970C;CMD:MLC,PCL,PML;CLASS:PRINTER;DESCRIPTION:Hewlett-Packard DeskJet 970C;SERN:US970CSEPROF;VSTATUS:$HB0$NC0,ff,DN,IDLE,CUT,K1,C0,DP,NR,KP000,CP027;VP:0800,FL,B0;VJ: ; */ /* * USB Printer Requests */ #define USBLP_REQ_GET_ID 0x00 #define USBLP_REQ_GET_STATUS 0x01 #define USBLP_REQ_RESET 0x02 #define USBLP_REQ_HP_CHANNEL_CHANGE_REQUEST 0x00 /* HP Vendor-specific */ #define USBLP_MINORS 16 #define USBLP_MINOR_BASE 0 #define USBLP_CTL_TIMEOUT 5000 /* 5 seconds */ #define USBLP_FIRST_PROTOCOL 1 #define USBLP_LAST_PROTOCOL 3 #define USBLP_MAX_PROTOCOLS (USBLP_LAST_PROTOCOL+1) /* * some arbitrary status buffer size; * need a status buffer that is allocated via kmalloc(), not on stack */ #define STATUS_BUF_SIZE 8 /* * Locks down the locking order: * ->wmut locks wstatus. * ->mut locks the whole usblp, except [rw]complete, and thus, by indirection, * [rw]status. We only touch status when we know the side idle. * ->lock locks what interrupt accesses. */ struct usblp { struct usb_device *dev; /* USB device */ struct mutex wmut; struct mutex mut; spinlock_t lock; /* locks rcomplete, wcomplete */ char *readbuf; /* read transfer_buffer */ char *statusbuf; /* status transfer_buffer */ struct usb_anchor urbs; wait_queue_head_t rwait, wwait; int readcount; /* Counter for reads */ int ifnum; /* Interface number */ struct usb_interface *intf; /* The interface */ /* * Alternate-setting numbers and endpoints for each protocol * (USB_CLASS_PRINTER/1/{index=1,2,3}) that the device supports: */ struct { int alt_setting; struct usb_endpoint_descriptor *epwrite; struct usb_endpoint_descriptor *epread; } protocol[USBLP_MAX_PROTOCOLS]; int current_protocol; int minor; /* minor number of device */ int wcomplete, rcomplete; int wstatus; /* bytes written or error */ int rstatus; /* bytes ready or error */ unsigned int quirks; /* quirks flags */ unsigned int flags; /* mode flags */ unsigned char used; /* True if open */ unsigned char present; /* True if not disconnected */ unsigned char bidir; /* interface is bidirectional */ unsigned char no_paper; /* Paper Out happened */ unsigned char *device_id_string; /* IEEE 1284 DEVICE ID string (ptr) */ /* first 2 bytes are (big-endian) length */ }; #ifdef DEBUG static void usblp_dump(struct usblp *usblp) { struct device *dev = &usblp->intf->dev; int p; dev_dbg(dev, "usblp=0x%p\n", usblp); dev_dbg(dev, "dev=0x%p\n", usblp->dev); dev_dbg(dev, "present=%d\n", usblp->present); dev_dbg(dev, "readbuf=0x%p\n", usblp->readbuf); dev_dbg(dev, "readcount=%d\n", usblp->readcount); dev_dbg(dev, "ifnum=%d\n", usblp->ifnum); for (p = USBLP_FIRST_PROTOCOL; p <= USBLP_LAST_PROTOCOL; p++) { dev_dbg(dev, "protocol[%d].alt_setting=%d\n", p, usblp->protocol[p].alt_setting); dev_dbg(dev, "protocol[%d].epwrite=%p\n", p, usblp->protocol[p].epwrite); dev_dbg(dev, "protocol[%d].epread=%p\n", p, usblp->protocol[p].epread); } dev_dbg(dev, "current_protocol=%d\n", usblp->current_protocol); dev_dbg(dev, "minor=%d\n", usblp->minor); dev_dbg(dev, "wstatus=%d\n", usblp->wstatus); dev_dbg(dev, "rstatus=%d\n", usblp->rstatus); dev_dbg(dev, "quirks=%d\n", usblp->quirks); dev_dbg(dev, "used=%d\n", usblp->used); dev_dbg(dev, "bidir=%d\n", usblp->bidir); dev_dbg(dev, "device_id_string=\"%s\"\n", usblp->device_id_string ? usblp->device_id_string + 2 : (unsigned char *)"(null)"); } #endif /* Quirks: various printer quirks are handled by this table & its flags. */ struct quirk_printer_struct { __u16 vendorId; __u16 productId; unsigned int quirks; }; #define USBLP_QUIRK_BIDIR 0x1 /* reports bidir but requires unidirectional mode (no INs/reads) */ #define USBLP_QUIRK_USB_INIT 0x2 /* needs vendor USB init string */ #define USBLP_QUIRK_BAD_CLASS 0x4 /* descriptor uses vendor-specific Class or SubClass */ static const struct quirk_printer_struct quirk_printers[] = { { 0x03f0, 0x0004, USBLP_QUIRK_BIDIR }, /* HP DeskJet 895C */ { 0x03f0, 0x0104, USBLP_QUIRK_BIDIR }, /* HP DeskJet 880C */ { 0x03f0, 0x0204, USBLP_QUIRK_BIDIR }, /* HP DeskJet 815C */ { 0x03f0, 0x0304, USBLP_QUIRK_BIDIR }, /* HP DeskJet 810C/812C */ { 0x03f0, 0x0404, USBLP_QUIRK_BIDIR }, /* HP DeskJet 830C */ { 0x03f0, 0x0504, USBLP_QUIRK_BIDIR }, /* HP DeskJet 885C */ { 0x03f0, 0x0604, USBLP_QUIRK_BIDIR }, /* HP DeskJet 840C */ { 0x03f0, 0x0804, USBLP_QUIRK_BIDIR }, /* HP DeskJet 816C */ { 0x03f0, 0x1104, USBLP_QUIRK_BIDIR }, /* HP Deskjet 959C */ { 0x0409, 0xefbe, USBLP_QUIRK_BIDIR }, /* NEC Picty900 (HP OEM) */ { 0x0409, 0xbef4, USBLP_QUIRK_BIDIR }, /* NEC Picty760 (HP OEM) */ { 0x0409, 0xf0be, USBLP_QUIRK_BIDIR }, /* NEC Picty920 (HP OEM) */ { 0x0409, 0xf1be, USBLP_QUIRK_BIDIR }, /* NEC Picty800 (HP OEM) */ { 0x0482, 0x0010, USBLP_QUIRK_BIDIR }, /* Kyocera Mita FS 820, by zut <kernel@zut.de> */ { 0x04f9, 0x000d, USBLP_QUIRK_BIDIR }, /* Brother Industries, Ltd HL-1440 Laser Printer */ { 0x04b8, 0x0202, USBLP_QUIRK_BAD_CLASS }, /* Seiko Epson Receipt Printer M129C */ { 0, 0 } }; static int usblp_wwait(struct usblp *usblp, int nonblock); static int usblp_wtest(struct usblp *usblp, int nonblock); static int usblp_rwait_and_lock(struct usblp *usblp, int nonblock); static int usblp_rtest(struct usblp *usblp, int nonblock); static int usblp_submit_read(struct usblp *usblp); static int usblp_select_alts(struct usblp *usblp); static int usblp_set_protocol(struct usblp *usblp, int protocol); static int usblp_cache_device_id_string(struct usblp *usblp); /* forward reference to make our lives easier */ static struct usb_driver usblp_driver; static DEFINE_MUTEX(usblp_mutex); /* locks the existence of usblp's */ /* * Functions for usblp control messages. */ static int usblp_ctrl_msg(struct usblp *usblp, int request, int type, int dir, int recip, int value, void *buf, int len) { int retval; int index = usblp->ifnum; /* High byte has the interface index. Low byte has the alternate setting. */ if ((request == USBLP_REQ_GET_ID) && (type == USB_TYPE_CLASS)) index = (usblp->ifnum<<8)|usblp->protocol[usblp->current_protocol].alt_setting; retval = usb_control_msg(usblp->dev, dir ? usb_rcvctrlpipe(usblp->dev, 0) : usb_sndctrlpipe(usblp->dev, 0), request, type | dir | recip, value, index, buf, len, USBLP_CTL_TIMEOUT); dev_dbg(&usblp->intf->dev, "usblp_control_msg: rq: 0x%02x dir: %d recip: %d value: %d idx: %d len: %#x result: %d\n", request, !!dir, recip, value, index, len, retval); return retval < 0 ? retval : 0; } #define usblp_read_status(usblp, status)\ usblp_ctrl_msg(usblp, USBLP_REQ_GET_STATUS, USB_TYPE_CLASS, USB_DIR_IN, USB_RECIP_INTERFACE, 0, status, 1) #define usblp_get_id(usblp, config, id, maxlen)\ usblp_ctrl_msg(usblp, USBLP_REQ_GET_ID, USB_TYPE_CLASS, USB_DIR_IN, USB_RECIP_INTERFACE, config, id, maxlen) #define usblp_reset(usblp)\ usblp_ctrl_msg(usblp, USBLP_REQ_RESET, USB_TYPE_CLASS, USB_DIR_OUT, USB_RECIP_OTHER, 0, NULL, 0) static int usblp_hp_channel_change_request(struct usblp *usblp, int channel, u8 *new_channel) { u8 *buf; int ret; buf = kzalloc(1, GFP_KERNEL); if (!buf) return -ENOMEM; ret = usblp_ctrl_msg(usblp, USBLP_REQ_HP_CHANNEL_CHANGE_REQUEST, USB_TYPE_VENDOR, USB_DIR_IN, USB_RECIP_INTERFACE, channel, buf, 1); if (ret == 0) *new_channel = buf[0]; kfree(buf); return ret; } /* * See the description for usblp_select_alts() below for the usage * explanation. Look into your /sys/kernel/debug/usb/devices and dmesg in * case of any trouble. */ static int proto_bias = -1; /* * URB callback. */ static void usblp_bulk_read(struct urb *urb) { struct usblp *usblp = urb->context; int status = urb->status; unsigned long flags; if (usblp->present && usblp->used) { if (status) printk(KERN_WARNING "usblp%d: " "nonzero read bulk status received: %d\n", usblp->minor, status); } spin_lock_irqsave(&usblp->lock, flags); if (status < 0) usblp->rstatus = status; else usblp->rstatus = urb->actual_length; usblp->rcomplete = 1; wake_up(&usblp->rwait); spin_unlock_irqrestore(&usblp->lock, flags); usb_free_urb(urb); } static void usblp_bulk_write(struct urb *urb) { struct usblp *usblp = urb->context; int status = urb->status; unsigned long flags; if (usblp->present && usblp->used) { if (status) printk(KERN_WARNING "usblp%d: " "nonzero write bulk status received: %d\n", usblp->minor, status); } spin_lock_irqsave(&usblp->lock, flags); if (status < 0) usblp->wstatus = status; else usblp->wstatus = urb->actual_length; usblp->no_paper = 0; usblp->wcomplete = 1; wake_up(&usblp->wwait); spin_unlock_irqrestore(&usblp->lock, flags); usb_free_urb(urb); } /* * Get and print printer errors. */ static const char *usblp_messages[] = { "ok", "out of paper", "off-line", "on fire" }; static int usblp_check_status(struct usblp *usblp, int err) { unsigned char status, newerr = 0; int error; mutex_lock(&usblp->mut); error = usblp_read_status(usblp, usblp->statusbuf); if (error < 0) { mutex_unlock(&usblp->mut); printk_ratelimited(KERN_ERR "usblp%d: error %d reading printer status\n", usblp->minor, error); return 0; } status = *usblp->statusbuf; mutex_unlock(&usblp->mut); if (~status & LP_PERRORP) newerr = 3; if (status & LP_POUTPA) newerr = 1; if (~status & LP_PSELECD) newerr = 2; if (newerr != err) { printk(KERN_INFO "usblp%d: %s\n", usblp->minor, usblp_messages[newerr]); } return newerr; } static int handle_bidir(struct usblp *usblp) { if (usblp->bidir && usblp->used) { if (usblp_submit_read(usblp) < 0) return -EIO; } return 0; } /* * File op functions. */ static int usblp_open(struct inode *inode, struct file *file) { int minor = iminor(inode); struct usblp *usblp; struct usb_interface *intf; int retval; if (minor < 0) return -ENODEV; mutex_lock(&usblp_mutex); retval = -ENODEV; intf = usb_find_interface(&usblp_driver, minor); if (!intf) goto out; usblp = usb_get_intfdata(intf); if (!usblp || !usblp->dev || !usblp->present) goto out; retval = -EBUSY; if (usblp->used) goto out; /* * We do not implement LP_ABORTOPEN/LPABORTOPEN for two reasons: * - We do not want persistent state which close(2) does not clear * - It is not used anyway, according to CUPS people */ retval = usb_autopm_get_interface(intf); if (retval < 0) goto out; usblp->used = 1; file->private_data = usblp; usblp->wcomplete = 1; /* we begin writeable */ usblp->wstatus = 0; usblp->rcomplete = 0; if (handle_bidir(usblp) < 0) { usb_autopm_put_interface(intf); usblp->used = 0; file->private_data = NULL; retval = -EIO; } out: mutex_unlock(&usblp_mutex); return retval; } static void usblp_cleanup(struct usblp *usblp) { printk(KERN_INFO "usblp%d: removed\n", usblp->minor); kfree(usblp->readbuf); kfree(usblp->device_id_string); kfree(usblp->statusbuf); usb_put_intf(usblp->intf); kfree(usblp); } static void usblp_unlink_urbs(struct usblp *usblp) { usb_kill_anchored_urbs(&usblp->urbs); } static int usblp_release(struct inode *inode, struct file *file) { struct usblp *usblp = file->private_data; usblp->flags &= ~LP_ABORT; mutex_lock(&usblp_mutex); usblp->used = 0; if (usblp->present) usblp_unlink_urbs(usblp); usb_autopm_put_interface(usblp->intf); if (!usblp->present) /* finish cleanup from disconnect */ usblp_cleanup(usblp); /* any URBs must be dead */ mutex_unlock(&usblp_mutex); return 0; } /* No kernel lock - fine */ static __poll_t usblp_poll(struct file *file, struct poll_table_struct *wait) { struct usblp *usblp = file->private_data; __poll_t ret = 0; unsigned long flags; /* Should we check file->f_mode & FMODE_WRITE before poll_wait()? */ poll_wait(file, &usblp->rwait, wait); poll_wait(file, &usblp->wwait, wait); mutex_lock(&usblp->mut); if (!usblp->present) ret |= EPOLLHUP; mutex_unlock(&usblp->mut); spin_lock_irqsave(&usblp->lock, flags); if (usblp->bidir && usblp->rcomplete) ret |= EPOLLIN | EPOLLRDNORM; if (usblp->no_paper || usblp->wcomplete) ret |= EPOLLOUT | EPOLLWRNORM; spin_unlock_irqrestore(&usblp->lock, flags); return ret; } static long usblp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct usblp *usblp = file->private_data; int length, err, i; unsigned char newChannel; int status; int twoints[2]; int retval = 0; mutex_lock(&usblp->mut); if (!usblp->present) { retval = -ENODEV; goto done; } dev_dbg(&usblp->intf->dev, "usblp_ioctl: cmd=0x%x (%c nr=%d len=%d dir=%d)\n", cmd, _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd), _IOC_DIR(cmd)); if (_IOC_TYPE(cmd) == 'P') /* new-style ioctl number */ switch (_IOC_NR(cmd)) { case IOCNR_GET_DEVICE_ID: /* get the DEVICE_ID string */ if (_IOC_DIR(cmd) != _IOC_READ) { retval = -EINVAL; goto done; } length = usblp_cache_device_id_string(usblp); if (length < 0) { retval = length; goto done; } if (length > _IOC_SIZE(cmd)) length = _IOC_SIZE(cmd); /* truncate */ if (copy_to_user((void __user *) arg, usblp->device_id_string, (unsigned long) length)) { retval = -EFAULT; goto done; } break; case IOCNR_GET_PROTOCOLS: if (_IOC_DIR(cmd) != _IOC_READ || _IOC_SIZE(cmd) < sizeof(twoints)) { retval = -EINVAL; goto done; } twoints[0] = usblp->current_protocol; twoints[1] = 0; for (i = USBLP_FIRST_PROTOCOL; i <= USBLP_LAST_PROTOCOL; i++) { if (usblp->protocol[i].alt_setting >= 0) twoints[1] |= (1<<i); } if (copy_to_user((void __user *)arg, (unsigned char *)twoints, sizeof(twoints))) { retval = -EFAULT; goto done; } break; case IOCNR_SET_PROTOCOL: if (_IOC_DIR(cmd) != _IOC_WRITE) { retval = -EINVAL; goto done; } #ifdef DEBUG if (arg == -10) { usblp_dump(usblp); break; } #endif usblp_unlink_urbs(usblp); retval = usblp_set_protocol(usblp, arg); if (retval < 0) { usblp_set_protocol(usblp, usblp->current_protocol); } break; case IOCNR_HP_SET_CHANNEL: if (_IOC_DIR(cmd) != _IOC_WRITE || le16_to_cpu(usblp->dev->descriptor.idVendor) != 0x03F0 || usblp->quirks & USBLP_QUIRK_BIDIR) { retval = -EINVAL; goto done; } err = usblp_hp_channel_change_request(usblp, arg, &newChannel); if (err < 0) { dev_err(&usblp->dev->dev, "usblp%d: error = %d setting " "HP channel\n", usblp->minor, err); retval = -EIO; goto done; } dev_dbg(&usblp->intf->dev, "usblp%d requested/got HP channel %ld/%d\n", usblp->minor, arg, newChannel); break; case IOCNR_GET_BUS_ADDRESS: if (_IOC_DIR(cmd) != _IOC_READ || _IOC_SIZE(cmd) < sizeof(twoints)) { retval = -EINVAL; goto done; } twoints[0] = usblp->dev->bus->busnum; twoints[1] = usblp->dev->devnum; if (copy_to_user((void __user *)arg, (unsigned char *)twoints, sizeof(twoints))) { retval = -EFAULT; goto done; } dev_dbg(&usblp->intf->dev, "usblp%d is bus=%d, device=%d\n", usblp->minor, twoints[0], twoints[1]); break; case IOCNR_GET_VID_PID: if (_IOC_DIR(cmd) != _IOC_READ || _IOC_SIZE(cmd) < sizeof(twoints)) { retval = -EINVAL; goto done; } twoints[0] = le16_to_cpu(usblp->dev->descriptor.idVendor); twoints[1] = le16_to_cpu(usblp->dev->descriptor.idProduct); if (copy_to_user((void __user *)arg, (unsigned char *)twoints, sizeof(twoints))) { retval = -EFAULT; goto done; } dev_dbg(&usblp->intf->dev, "usblp%d is VID=0x%4.4X, PID=0x%4.4X\n", usblp->minor, twoints[0], twoints[1]); break; case IOCNR_SOFT_RESET: if (_IOC_DIR(cmd) != _IOC_NONE) { retval = -EINVAL; goto done; } retval = usblp_reset(usblp); break; default: retval = -ENOTTY; } else /* old-style ioctl value */ switch (cmd) { case LPGETSTATUS: retval = usblp_read_status(usblp, usblp->statusbuf); if (retval) { printk_ratelimited(KERN_ERR "usblp%d:" "failed reading printer status (%d)\n", usblp->minor, retval); retval = -EIO; goto done; } status = *usblp->statusbuf; if (copy_to_user((void __user *)arg, &status, sizeof(int))) retval = -EFAULT; break; case LPABORT: if (arg) usblp->flags |= LP_ABORT; else usblp->flags &= ~LP_ABORT; break; default: retval = -ENOTTY; } done: mutex_unlock(&usblp->mut); return retval; } static struct urb *usblp_new_writeurb(struct usblp *usblp, int transfer_length) { struct urb *urb; char *writebuf; writebuf = kmalloc(transfer_length, GFP_KERNEL); if (writebuf == NULL) return NULL; urb = usb_alloc_urb(0, GFP_KERNEL); if (urb == NULL) { kfree(writebuf); return NULL; } usb_fill_bulk_urb(urb, usblp->dev, usb_sndbulkpipe(usblp->dev, usblp->protocol[usblp->current_protocol].epwrite->bEndpointAddress), writebuf, transfer_length, usblp_bulk_write, usblp); urb->transfer_flags |= URB_FREE_BUFFER; return urb; } static ssize_t usblp_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct usblp *usblp = file->private_data; struct urb *writeurb; int rv; int transfer_length; ssize_t writecount = 0; if (mutex_lock_interruptible(&usblp->wmut)) { rv = -EINTR; goto raise_biglock; } rv = usblp_wwait(usblp, !!(file->f_flags & O_NONBLOCK)); if (rv < 0) goto raise_wait; while (writecount < count) { /* * Step 1: Submit next block. */ transfer_length = count - writecount; if (transfer_length > USBLP_BUF_SIZE) transfer_length = USBLP_BUF_SIZE; rv = -ENOMEM; writeurb = usblp_new_writeurb(usblp, transfer_length); if (writeurb == NULL) goto raise_urb; usb_anchor_urb(writeurb, &usblp->urbs); if (copy_from_user(writeurb->transfer_buffer, buffer + writecount, transfer_length)) { rv = -EFAULT; goto raise_badaddr; } spin_lock_irq(&usblp->lock); usblp->wcomplete = 0; spin_unlock_irq(&usblp->lock); rv = usb_submit_urb(writeurb, GFP_KERNEL); if (rv < 0) { usblp->wstatus = 0; spin_lock_irq(&usblp->lock); usblp->no_paper = 0; usblp->wcomplete = 1; wake_up(&usblp->wwait); spin_unlock_irq(&usblp->lock); if (rv != -ENOMEM) rv = -EIO; goto raise_submit; } /* * Step 2: Wait for transfer to end, collect results. */ rv = usblp_wwait(usblp, !!(file->f_flags&O_NONBLOCK)); if (rv < 0) { if (rv == -EAGAIN) { /* Presume that it's going to complete well. */ writecount += transfer_length; } if (rv == -ENOSPC) { spin_lock_irq(&usblp->lock); usblp->no_paper = 1; /* Mark for poll(2) */ spin_unlock_irq(&usblp->lock); writecount += transfer_length; } /* Leave URB dangling, to be cleaned on close. */ goto collect_error; } if (usblp->wstatus < 0) { rv = -EIO; goto collect_error; } /* * This is critical: it must be our URB, not other writer's. * The wmut exists mainly to cover us here. */ writecount += usblp->wstatus; } mutex_unlock(&usblp->wmut); return writecount; raise_submit: raise_badaddr: usb_unanchor_urb(writeurb); usb_free_urb(writeurb); raise_urb: raise_wait: collect_error: /* Out of raise sequence */ mutex_unlock(&usblp->wmut); raise_biglock: return writecount ? writecount : rv; } /* * Notice that we fail to restart in a few cases: on EFAULT, on restart * error, etc. This is the historical behaviour. In all such cases we return * EIO, and applications loop in order to get the new read going. */ static ssize_t usblp_read(struct file *file, char __user *buffer, size_t len, loff_t *ppos) { struct usblp *usblp = file->private_data; ssize_t count; ssize_t avail; int rv; if (!usblp->bidir) return -EINVAL; rv = usblp_rwait_and_lock(usblp, !!(file->f_flags & O_NONBLOCK)); if (rv < 0) return rv; if (!usblp->present) { count = -ENODEV; goto done; } avail = usblp->rstatus; if (avail < 0) { printk(KERN_ERR "usblp%d: error %d reading from printer\n", usblp->minor, (int)avail); usblp_submit_read(usblp); count = -EIO; goto done; } count = min_t(ssize_t, len, avail - usblp->readcount); if (count != 0 && copy_to_user(buffer, usblp->readbuf + usblp->readcount, count)) { count = -EFAULT; goto done; } usblp->readcount += count; if (usblp->readcount == avail) { if (usblp_submit_read(usblp) < 0) { /* We don't want to leak USB return codes into errno. */ if (count == 0) count = -EIO; goto done; } } done: mutex_unlock(&usblp->mut); return count; } /* * Wait for the write path to come idle. * This is called under the ->wmut, so the idle path stays idle. * * Our write path has a peculiar property: it does not buffer like a tty, * but waits for the write to succeed. This allows our ->release to bug out * without waiting for writes to drain. But it obviously does not work * when O_NONBLOCK is set. So, applications setting O_NONBLOCK must use * select(2) or poll(2) to wait for the buffer to drain before closing. * Alternatively, set blocking mode with fcntl and issue a zero-size write. */ static int usblp_wwait(struct usblp *usblp, int nonblock) { DECLARE_WAITQUEUE(waita, current); int rc; int err = 0; add_wait_queue(&usblp->wwait, &waita); for (;;) { if (mutex_lock_interruptible(&usblp->mut)) { rc = -EINTR; break; } set_current_state(TASK_INTERRUPTIBLE); rc = usblp_wtest(usblp, nonblock); mutex_unlock(&usblp->mut); if (rc <= 0) break; if (schedule_timeout(msecs_to_jiffies(1500)) == 0) { if (usblp->flags & LP_ABORT) { err = usblp_check_status(usblp, err); if (err == 1) { /* Paper out */ rc = -ENOSPC; break; } } else { /* Prod the printer, Gentoo#251237. */ mutex_lock(&usblp->mut); usblp_read_status(usblp, usblp->statusbuf); mutex_unlock(&usblp->mut); } } } set_current_state(TASK_RUNNING); remove_wait_queue(&usblp->wwait, &waita); return rc; } static int usblp_wtest(struct usblp *usblp, int nonblock) { unsigned long flags; if (!usblp->present) return -ENODEV; if (signal_pending(current)) return -EINTR; spin_lock_irqsave(&usblp->lock, flags); if (usblp->wcomplete) { spin_unlock_irqrestore(&usblp->lock, flags); return 0; } spin_unlock_irqrestore(&usblp->lock, flags); if (nonblock) return -EAGAIN; return 1; } /* * Wait for read bytes to become available. This probably should have been * called usblp_r_lock_and_wait(), because we lock first. But it's a traditional * name for functions which lock and return. * * We do not use wait_event_interruptible because it makes locking iffy. */ static int usblp_rwait_and_lock(struct usblp *usblp, int nonblock) { DECLARE_WAITQUEUE(waita, current); int rc; add_wait_queue(&usblp->rwait, &waita); for (;;) { if (mutex_lock_interruptible(&usblp->mut)) { rc = -EINTR; break; } set_current_state(TASK_INTERRUPTIBLE); rc = usblp_rtest(usblp, nonblock); if (rc < 0) { mutex_unlock(&usblp->mut); break; } if (rc == 0) /* Keep it locked */ break; mutex_unlock(&usblp->mut); schedule(); } set_current_state(TASK_RUNNING); remove_wait_queue(&usblp->rwait, &waita); return rc; } static int usblp_rtest(struct usblp *usblp, int nonblock) { unsigned long flags; if (!usblp->present) return -ENODEV; if (signal_pending(current)) return -EINTR; spin_lock_irqsave(&usblp->lock, flags); if (usblp->rcomplete) { spin_unlock_irqrestore(&usblp->lock, flags); return 0; } spin_unlock_irqrestore(&usblp->lock, flags); if (nonblock) return -EAGAIN; return 1; } /* * Please check ->bidir and other such things outside for now. */ static int usblp_submit_read(struct usblp *usblp) { struct urb *urb; unsigned long flags; int rc; rc = -ENOMEM; urb = usb_alloc_urb(0, GFP_KERNEL); if (urb == NULL) goto raise_urb; usb_fill_bulk_urb(urb, usblp->dev, usb_rcvbulkpipe(usblp->dev, usblp->protocol[usblp->current_protocol].epread->bEndpointAddress), usblp->readbuf, USBLP_BUF_SIZE_IN, usblp_bulk_read, usblp); usb_anchor_urb(urb, &usblp->urbs); spin_lock_irqsave(&usblp->lock, flags); usblp->readcount = 0; /* XXX Why here? */ usblp->rcomplete = 0; spin_unlock_irqrestore(&usblp->lock, flags); rc = usb_submit_urb(urb, GFP_KERNEL); if (rc < 0) { dev_dbg(&usblp->intf->dev, "error submitting urb (%d)\n", rc); spin_lock_irqsave(&usblp->lock, flags); usblp->rstatus = rc; usblp->rcomplete = 1; spin_unlock_irqrestore(&usblp->lock, flags); goto raise_submit; } return 0; raise_submit: usb_unanchor_urb(urb); usb_free_urb(urb); raise_urb: return rc; } /* * Checks for printers that have quirks, such as requiring unidirectional * communication but reporting bidirectional; currently some HP printers * have this flaw (HP 810, 880, 895, etc.), or needing an init string * sent at each open (like some Epsons). * Returns 1 if found, 0 if not found. * * HP recommended that we use the bidirectional interface but * don't attempt any bulk IN transfers from the IN endpoint. * Here's some more detail on the problem: * The problem is not that it isn't bidirectional though. The problem * is that if you request a device ID, or status information, while * the buffers are full, the return data will end up in the print data * buffer. For example if you make sure you never request the device ID * while you are sending print data, and you don't try to query the * printer status every couple of milliseconds, you will probably be OK. */ static unsigned int usblp_quirks(__u16 vendor, __u16 product) { int i; for (i = 0; quirk_printers[i].vendorId; i++) { if (vendor == quirk_printers[i].vendorId && product == quirk_printers[i].productId) return quirk_printers[i].quirks; } return 0; } static const struct file_operations usblp_fops = { .owner = THIS_MODULE, .read = usblp_read, .write = usblp_write, .poll = usblp_poll, .unlocked_ioctl = usblp_ioctl, .compat_ioctl = usblp_ioctl, .open = usblp_open, .release = usblp_release, .llseek = noop_llseek, }; static char *usblp_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); } static struct usb_class_driver usblp_class = { .name = "lp%d", .devnode = usblp_devnode, .fops = &usblp_fops, .minor_base = USBLP_MINOR_BASE, }; static ssize_t ieee1284_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(dev); struct usblp *usblp = usb_get_intfdata(intf); if (usblp->device_id_string[0] == 0 && usblp->device_id_string[1] == 0) return 0; return sprintf(buf, "%s", usblp->device_id_string+2); } static DEVICE_ATTR_RO(ieee1284_id); static struct attribute *usblp_attrs[] = { &dev_attr_ieee1284_id.attr, NULL, }; ATTRIBUTE_GROUPS(usblp); static int usblp_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *dev = interface_to_usbdev(intf); struct usblp *usblp; int protocol; int retval; /* Malloc and start initializing usblp structure so we can use it * directly. */ usblp = kzalloc(sizeof(struct usblp), GFP_KERNEL); if (!usblp) { retval = -ENOMEM; goto abort_ret; } usblp->dev = dev; mutex_init(&usblp->wmut); mutex_init(&usblp->mut); spin_lock_init(&usblp->lock); init_waitqueue_head(&usblp->rwait); init_waitqueue_head(&usblp->wwait); init_usb_anchor(&usblp->urbs); usblp->ifnum = intf->cur_altsetting->desc.bInterfaceNumber; usblp->intf = usb_get_intf(intf); /* Malloc device ID string buffer to the largest expected length, * since we can re-query it on an ioctl and a dynamic string * could change in length. */ usblp->device_id_string = kmalloc(USBLP_DEVICE_ID_SIZE, GFP_KERNEL); if (!usblp->device_id_string) { retval = -ENOMEM; goto abort; } /* * Allocate read buffer. We somewhat wastefully * malloc both regardless of bidirectionality, because the * alternate setting can be changed later via an ioctl. */ usblp->readbuf = kmalloc(USBLP_BUF_SIZE_IN, GFP_KERNEL); if (!usblp->readbuf) { retval = -ENOMEM; goto abort; } /* Allocate buffer for printer status */ usblp->statusbuf = kmalloc(STATUS_BUF_SIZE, GFP_KERNEL); if (!usblp->statusbuf) { retval = -ENOMEM; goto abort; } /* Lookup quirks for this printer. */ usblp->quirks = usblp_quirks( le16_to_cpu(dev->descriptor.idVendor), le16_to_cpu(dev->descriptor.idProduct)); /* Analyze and pick initial alternate settings and endpoints. */ protocol = usblp_select_alts(usblp); if (protocol < 0) { dev_dbg(&intf->dev, "incompatible printer-class device 0x%4.4X/0x%4.4X\n", le16_to_cpu(dev->descriptor.idVendor), le16_to_cpu(dev->descriptor.idProduct)); retval = -ENODEV; goto abort; } /* Setup the selected alternate setting and endpoints. */ if (usblp_set_protocol(usblp, protocol) < 0) { retval = -ENODEV; /* ->probe isn't ->ioctl */ goto abort; } /* Retrieve and store the device ID string. */ usblp_cache_device_id_string(usblp); #ifdef DEBUG usblp_check_status(usblp, 0); #endif usb_set_intfdata(intf, usblp); usblp->present = 1; retval = usb_register_dev(intf, &usblp_class); if (retval) { dev_err(&intf->dev, "usblp: Not able to get a minor (base %u, slice default): %d\n", USBLP_MINOR_BASE, retval); goto abort_intfdata; } usblp->minor = intf->minor; dev_info(&intf->dev, "usblp%d: USB %sdirectional printer dev %d if %d alt %d proto %d vid 0x%4.4X pid 0x%4.4X\n", usblp->minor, usblp->bidir ? "Bi" : "Uni", dev->devnum, usblp->ifnum, usblp->protocol[usblp->current_protocol].alt_setting, usblp->current_protocol, le16_to_cpu(usblp->dev->descriptor.idVendor), le16_to_cpu(usblp->dev->descriptor.idProduct)); return 0; abort_intfdata: usb_set_intfdata(intf, NULL); abort: kfree(usblp->readbuf); kfree(usblp->statusbuf); kfree(usblp->device_id_string); usb_put_intf(usblp->intf); kfree(usblp); abort_ret: return retval; } /* * We are a "new" style driver with usb_device_id table, * but our requirements are too intricate for simple match to handle. * * The "proto_bias" option may be used to specify the preferred protocol * for all USB printers (1=USB_CLASS_PRINTER/1/1, 2=USB_CLASS_PRINTER/1/2, * 3=USB_CLASS_PRINTER/1/3). If the device supports the preferred protocol, * then we bind to it. * * The best interface for us is USB_CLASS_PRINTER/1/2, because it * is compatible with a stream of characters. If we find it, we bind to it. * * Note that the people from hpoj.sourceforge.net need to be able to * bind to USB_CLASS_PRINTER/1/3 (MLC/1284.4), so we provide them ioctls * for this purpose. * * Failing USB_CLASS_PRINTER/1/2, we look for USB_CLASS_PRINTER/1/3, * even though it's probably not stream-compatible, because this matches * the behaviour of the old code. * * If nothing else, we bind to USB_CLASS_PRINTER/1/1 * - the unidirectional interface. */ static int usblp_select_alts(struct usblp *usblp) { struct usb_interface *if_alt; struct usb_host_interface *ifd; struct usb_endpoint_descriptor *epwrite, *epread; int p, i; int res; if_alt = usblp->intf; for (p = 0; p < USBLP_MAX_PROTOCOLS; p++) usblp->protocol[p].alt_setting = -1; /* Find out what we have. */ for (i = 0; i < if_alt->num_altsetting; i++) { ifd = &if_alt->altsetting[i]; if (ifd->desc.bInterfaceClass != USB_CLASS_PRINTER || ifd->desc.bInterfaceSubClass != 1) if (!(usblp->quirks & USBLP_QUIRK_BAD_CLASS)) continue; if (ifd->desc.bInterfaceProtocol < USBLP_FIRST_PROTOCOL || ifd->desc.bInterfaceProtocol > USBLP_LAST_PROTOCOL) continue; /* Look for the expected bulk endpoints. */ if (ifd->desc.bInterfaceProtocol > 1) { res = usb_find_common_endpoints(ifd, &epread, &epwrite, NULL, NULL); } else { epread = NULL; res = usb_find_bulk_out_endpoint(ifd, &epwrite); } /* Ignore buggy hardware without the right endpoints. */ if (res) continue; /* Turn off reads for buggy bidirectional printers. */ if (usblp->quirks & USBLP_QUIRK_BIDIR) { printk(KERN_INFO "usblp%d: Disabling reads from " "problematic bidirectional printer\n", usblp->minor); epread = NULL; } usblp->protocol[ifd->desc.bInterfaceProtocol].alt_setting = ifd->desc.bAlternateSetting; usblp->protocol[ifd->desc.bInterfaceProtocol].epwrite = epwrite; usblp->protocol[ifd->desc.bInterfaceProtocol].epread = epread; } /* If our requested protocol is supported, then use it. */ if (proto_bias >= USBLP_FIRST_PROTOCOL && proto_bias <= USBLP_LAST_PROTOCOL && usblp->protocol[proto_bias].alt_setting != -1) return proto_bias; /* Ordering is important here. */ if (usblp->protocol[2].alt_setting != -1) return 2; if (usblp->protocol[1].alt_setting != -1) return 1; if (usblp->protocol[3].alt_setting != -1) return 3; /* If nothing is available, then don't bind to this device. */ return -1; } static int usblp_set_protocol(struct usblp *usblp, int protocol) { int r, alts; if (protocol < USBLP_FIRST_PROTOCOL || protocol > USBLP_LAST_PROTOCOL) return -EINVAL; alts = usblp->protocol[protocol].alt_setting; if (alts < 0) return -EINVAL; /* Don't unnecessarily set the interface if there's a single alt. */ if (usblp->intf->num_altsetting > 1) { r = usb_set_interface(usblp->dev, usblp->ifnum, alts); if (r < 0) { printk(KERN_ERR "usblp: can't set desired altsetting %d on interface %d\n", alts, usblp->ifnum); return r; } } usblp->bidir = (usblp->protocol[protocol].epread != NULL); usblp->current_protocol = protocol; dev_dbg(&usblp->intf->dev, "usblp%d set protocol %d\n", usblp->minor, protocol); return 0; } /* Retrieves and caches device ID string. * Returns length, including length bytes but not null terminator. * On error, returns a negative errno value. */ static int usblp_cache_device_id_string(struct usblp *usblp) { int err, length; err = usblp_get_id(usblp, 0, usblp->device_id_string, USBLP_DEVICE_ID_SIZE - 1); if (err < 0) { dev_dbg(&usblp->intf->dev, "usblp%d: error = %d reading IEEE-1284 Device ID string\n", usblp->minor, err); usblp->device_id_string[0] = usblp->device_id_string[1] = '\0'; return -EIO; } /* First two bytes are length in big-endian. * They count themselves, and we copy them into * the user's buffer. */ length = be16_to_cpu(*((__be16 *)usblp->device_id_string)); if (length < 2) length = 2; else if (length >= USBLP_DEVICE_ID_SIZE) length = USBLP_DEVICE_ID_SIZE - 1; usblp->device_id_string[length] = '\0'; dev_dbg(&usblp->intf->dev, "usblp%d Device ID string [len=%d]=\"%s\"\n", usblp->minor, length, &usblp->device_id_string[2]); return length; } static void usblp_disconnect(struct usb_interface *intf) { struct usblp *usblp = usb_get_intfdata(intf); usb_deregister_dev(intf, &usblp_class); if (!usblp || !usblp->dev) { dev_err(&intf->dev, "bogus disconnect\n"); BUG(); } mutex_lock(&usblp_mutex); mutex_lock(&usblp->mut); usblp->present = 0; wake_up(&usblp->wwait); wake_up(&usblp->rwait); usb_set_intfdata(intf, NULL); usblp_unlink_urbs(usblp); mutex_unlock(&usblp->mut); usb_poison_anchored_urbs(&usblp->urbs); if (!usblp->used) usblp_cleanup(usblp); mutex_unlock(&usblp_mutex); } static int usblp_suspend(struct usb_interface *intf, pm_message_t message) { struct usblp *usblp = usb_get_intfdata(intf); usblp_unlink_urbs(usblp); #if 0 /* XXX Do we want this? What if someone is reading, should we fail? */ /* not strictly necessary, but just in case */ wake_up(&usblp->wwait); wake_up(&usblp->rwait); #endif return 0; } static int usblp_resume(struct usb_interface *intf) { struct usblp *usblp = usb_get_intfdata(intf); int r; r = handle_bidir(usblp); return r; } static const struct usb_device_id usblp_ids[] = { { USB_DEVICE_INFO(USB_CLASS_PRINTER, 1, 1) }, { USB_DEVICE_INFO(USB_CLASS_PRINTER, 1, 2) }, { USB_DEVICE_INFO(USB_CLASS_PRINTER, 1, 3) }, { USB_INTERFACE_INFO(USB_CLASS_PRINTER, 1, 1) }, { USB_INTERFACE_INFO(USB_CLASS_PRINTER, 1, 2) }, { USB_INTERFACE_INFO(USB_CLASS_PRINTER, 1, 3) }, { USB_DEVICE(0x04b8, 0x0202) }, /* Seiko Epson Receipt Printer M129C */ { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, usblp_ids); static struct usb_driver usblp_driver = { .name = "usblp", .probe = usblp_probe, .disconnect = usblp_disconnect, .suspend = usblp_suspend, .resume = usblp_resume, .id_table = usblp_ids, .dev_groups = usblp_groups, .supports_autosuspend = 1, }; module_usb_driver(usblp_driver); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); module_param(proto_bias, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(proto_bias, "Favourite protocol number"); MODULE_LICENSE("GPL");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2006, Intel Corporation. * * Copyright (C) 2006-2008 Intel Corporation * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> */ #ifndef _IOVA_H_ #define _IOVA_H_ #include <linux/types.h> #include <linux/kernel.h> #include <linux/rbtree.h> #include <linux/dma-mapping.h> /* iova structure */ struct iova { struct rb_node node; unsigned long pfn_hi; /* Highest allocated pfn */ unsigned long pfn_lo; /* Lowest allocated pfn */ }; struct iova_rcache; /* holds all the iova translations for a domain */ struct iova_domain { spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */ struct rb_root rbroot; /* iova domain rbtree root */ struct rb_node *cached_node; /* Save last alloced node */ struct rb_node *cached32_node; /* Save last 32-bit alloced node */ unsigned long granule; /* pfn granularity for this domain */ unsigned long start_pfn; /* Lower limit for this domain */ unsigned long dma_32bit_pfn; unsigned long max32_alloc_size; /* Size of last failed allocation */ struct iova anchor; /* rbtree lookup anchor */ struct iova_rcache *rcaches; struct hlist_node cpuhp_dead; }; static inline unsigned long iova_size(struct iova *iova) { return iova->pfn_hi - iova->pfn_lo + 1; } static inline unsigned long iova_shift(struct iova_domain *iovad) { return __ffs(iovad->granule); } static inline unsigned long iova_mask(struct iova_domain *iovad) { return iovad->granule - 1; } static inline size_t iova_offset(struct iova_domain *iovad, dma_addr_t iova) { return iova & iova_mask(iovad); } static inline size_t iova_align(struct iova_domain *iovad, size_t size) { return ALIGN(size, iovad->granule); } static inline size_t iova_align_down(struct iova_domain *iovad, size_t size) { return ALIGN_DOWN(size, iovad->granule); } static inline dma_addr_t iova_dma_addr(struct iova_domain *iovad, struct iova *iova) { return (dma_addr_t)iova->pfn_lo << iova_shift(iovad); } static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova) { return iova >> iova_shift(iovad); } #if IS_REACHABLE(CONFIG_IOMMU_IOVA) int iova_cache_get(void); void iova_cache_put(void); unsigned long iova_rcache_range(void); void free_iova(struct iova_domain *iovad, unsigned long pfn); void __free_iova(struct iova_domain *iovad, struct iova *iova); struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn, bool size_aligned); void free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size); unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn, bool flush_rcache); struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, unsigned long pfn_hi); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, unsigned long start_pfn); int iova_domain_init_rcaches(struct iova_domain *iovad); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); #else static inline int iova_cache_get(void) { return -ENOTSUPP; } static inline void iova_cache_put(void) { } static inline void free_iova(struct iova_domain *iovad, unsigned long pfn) { } static inline void __free_iova(struct iova_domain *iovad, struct iova *iova) { } static inline struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn, bool size_aligned) { return NULL; } static inline void free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size) { } static inline unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn, bool flush_rcache) { return 0; } static inline struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, unsigned long pfn_hi) { return NULL; } static inline void init_iova_domain(struct iova_domain *iovad, unsigned long granule, unsigned long start_pfn) { } static inline struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn) { return NULL; } static inline void put_iova_domain(struct iova_domain *iovad) { } #endif #endif
23 5 5 5 5 5 173 174 1 163 143 145 163 144 144 144 110 109 76 55 33 89 88 89 89 23 38 23 23 39 38 17 17 44 36 5 4 8 1 1 1 1 3 1 3 1 2 1 4 4 1 1 1 1 27 27 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 /* * * Copyright IBM Corporation, 2012 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * * Cgroup v2 * Copyright (C) 2019 Red Hat, Inc. * Author: Giuseppe Scrivano <gscrivan@redhat.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License * as published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * */ #include <linux/cgroup.h> #include <linux/page_counter.h> #include <linux/slab.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) /* Use t->m[0] to encode the offset */ #define MEMFILE_OFFSET(t, m0) (((offsetof(t, m0) << 16) | sizeof_field(t, m0))) #define MEMFILE_OFFSET0(val) (((val) >> 16) & 0xffff) #define MEMFILE_FIELD_SIZE(val) ((val) & 0xffff) #define DFL_TMPL_SIZE ARRAY_SIZE(hugetlb_dfl_tmpl) #define LEGACY_TMPL_SIZE ARRAY_SIZE(hugetlb_legacy_tmpl) static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static struct cftype *dfl_files; static struct cftype *legacy_files; static inline struct page_counter * __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx, bool rsvd) { if (rsvd) return &h_cg->rsvd_hugepage[idx]; return &h_cg->hugepage[idx]; } static inline struct page_counter * hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx) { return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false); } static inline struct page_counter * hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx) { return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true); } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) { return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); } static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) { return (h_cg == root_h_cgroup); } static inline struct hugetlb_cgroup * parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) { return hugetlb_cgroup_from_css(h_cg->css.parent); } static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) { struct hstate *h; for_each_hstate(h) { if (page_counter_read( hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h)))) return true; } return false; } static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, struct hugetlb_cgroup *parent_h_cgroup) { int idx; for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { struct page_counter *fault, *fault_parent = NULL; struct page_counter *rsvd, *rsvd_parent = NULL; unsigned long limit; if (parent_h_cgroup) { fault_parent = hugetlb_cgroup_counter_from_cgroup( parent_h_cgroup, idx); rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( parent_h_cgroup, idx); } fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx); rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx); page_counter_init(fault, fault_parent, false); page_counter_init(rsvd, rsvd_parent, false); if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) { fault->track_failcnt = true; rsvd->track_failcnt = true; } limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); VM_BUG_ON(page_counter_set_max(fault, limit)); VM_BUG_ON(page_counter_set_max(rsvd, limit)); } } static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) { int node; for_each_node(node) kfree(h_cgroup->nodeinfo[node]); kfree(h_cgroup); } static struct cgroup_subsys_state * hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); struct hugetlb_cgroup *h_cgroup; int node; h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); if (!parent_h_cgroup) root_h_cgroup = h_cgroup; /* * TODO: this routine can waste much memory for nodes which will * never be onlined. It's better to use memory hotplug callback * function. */ for_each_node(node) { /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */ int node_to_alloc = node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE; h_cgroup->nodeinfo[node] = kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), GFP_KERNEL, node_to_alloc); if (!h_cgroup->nodeinfo[node]) goto fail_alloc_nodeinfo; } hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); return &h_cgroup->css; fail_alloc_nodeinfo: hugetlb_cgroup_free(h_cgroup); return ERR_PTR(-ENOMEM); } static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) { hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); } /* * Should be called with hugetlb_lock held. * Since we are holding hugetlb_lock, pages cannot get moved from * active list or uncharged from the cgroup, So no need to get * page reference and test for page active here. This function * cannot fail. */ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct folio *folio) { unsigned int nr_pages; struct page_counter *counter; struct hugetlb_cgroup *hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); hcg = hugetlb_cgroup_from_folio(folio); /* * We can have pages in active list without any cgroup * ie, hugepage with less than 3 pages. We can safely * ignore those pages. */ if (!hcg || hcg != h_cg) goto out; nr_pages = folio_nr_pages(folio); if (!parent) { parent = root_h_cgroup; /* root has no limit */ page_counter_charge(&parent->hugepage[idx], nr_pages); } counter = &h_cg->hugepage[idx]; /* Take the pages off the local counter */ page_counter_cancel(counter, nr_pages); set_hugetlb_cgroup(folio, parent); out: return; } /* * Force the hugetlb cgroup to empty the hugetlb resources by moving them to * the parent cgroup. */ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; struct folio *folio; do { for_each_hstate(h) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(folio, &h->hugepage_activelist, lru) hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio); spin_unlock_irq(&hugetlb_lock); } cond_resched(); } while (hugetlb_cgroup_have_usage(h_cg)); } static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, enum hugetlb_memory_event event) { atomic_long_inc(&hugetlb->events_local[idx][event]); cgroup_file_notify(&hugetlb->events_local_file[idx]); do { atomic_long_inc(&hugetlb->events[idx][event]); cgroup_file_notify(&hugetlb->events_file[idx]); } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) && !hugetlb_cgroup_is_root(hugetlb)); } static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr, bool rsvd) { int ret = 0; struct page_counter *counter; struct hugetlb_cgroup *h_cg = NULL; if (hugetlb_cgroup_disabled()) goto done; again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); if (!css_tryget(&h_cg->css)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); if (!page_counter_try_charge( __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages, &counter)) { ret = -ENOMEM; hugetlb_event(h_cg, idx, HUGETLB_MAX); css_put(&h_cg->css); goto done; } /* Reservations take a reference to the css because they do not get * reparented. */ if (!rsvd) css_put(&h_cg->css); done: *ptr = h_cg; return ret; } int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false); } int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true); } /* Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; lockdep_assert_held(&hugetlb_lock); __set_hugetlb_cgroup(folio, h_cg, rsvd); if (!rsvd) { unsigned long usage = h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. */ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage + nr_pages); } } void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); } void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); } /* * Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio, bool rsvd) { struct hugetlb_cgroup *h_cg; if (hugetlb_cgroup_disabled()) return; lockdep_assert_held(&hugetlb_lock); h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); if (unlikely(!h_cg)) return; __set_hugetlb_cgroup(folio, NULL, rsvd); page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); if (rsvd) css_put(&h_cg->css); else { unsigned long usage = h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. */ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage - nr_pages); } } void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio) { __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false); } void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, struct folio *folio) { __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true); } static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); if (rsvd) css_put(&h_cg->css); } void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false); } void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true); } void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, unsigned long end) { if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter || !resv->css) return; page_counter_uncharge(resv->reservation_counter, (end - start) * resv->pages_per_hpage); css_put(resv->css); } void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, unsigned long nr_pages, bool region_del) { if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) return; if (rg->reservation_counter && resv->pages_per_hpage && !resv->reservation_counter) { page_counter_uncharge(rg->reservation_counter, nr_pages * resv->pages_per_hpage); /* * Only do css_put(rg->css) when we delete the entire region * because one file_region must hold exactly one css reference. */ if (region_del) css_put(rg->css); } } enum { RES_USAGE, RES_RSVD_USAGE, RES_LIMIT, RES_RSVD_LIMIT, RES_MAX_USAGE, RES_RSVD_MAX_USAGE, RES_FAILCNT, RES_RSVD_FAILCNT, }; static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) { int nid; struct cftype *cft = seq_cft(seq); int idx = MEMFILE_IDX(cft->private); bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys); struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); struct cgroup_subsys_state *css; unsigned long usage; if (legacy) { /* Add up usage across all nodes for the non-hierarchical total. */ usage = 0; for_each_node_state(nid, N_MEMORY) usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); seq_printf(seq, "total=%lu", usage * PAGE_SIZE); /* Simply print the per-node usage for the non-hierarchical total. */ for_each_node_state(nid, N_MEMORY) seq_printf(seq, " N%d=%lu", nid, READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * PAGE_SIZE); seq_putc(seq, '\n'); } /* * The hierarchical total is pretty much the value recorded by the * counter, so use that. */ seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); /* * For each node, transverse the css tree to obtain the hierarchical * node usage. */ for_each_node_state(nid, N_MEMORY) { usage = 0; rcu_read_lock(); css_for_each_descendant_pre(css, &h_cg->css) { usage += READ_ONCE(hugetlb_cgroup_from_css(css) ->nodeinfo[nid] ->usage[idx]); } rcu_read_unlock(); seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); } seq_putc(seq, '\n'); return 0; } static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct page_counter *counter; struct page_counter *rsvd_counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)]; switch (MEMFILE_ATTR(cft->private)) { case RES_USAGE: return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_RSVD_USAGE: return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE; case RES_LIMIT: return (u64)counter->max * PAGE_SIZE; case RES_RSVD_LIMIT: return (u64)rsvd_counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_RSVD_MAX_USAGE: return (u64)rsvd_counter->watermark * PAGE_SIZE; case RES_FAILCNT: return counter->failcnt; case RES_RSVD_FAILCNT: return rsvd_counter->failcnt; default: BUG(); } } static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) { int idx; u64 val; struct cftype *cft = seq_cft(seq); unsigned long limit; struct page_counter *counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); idx = MEMFILE_IDX(cft->private); counter = &h_cg->hugepage[idx]; limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(cft->private)) { case RES_RSVD_USAGE: counter = &h_cg->rsvd_hugepage[idx]; fallthrough; case RES_USAGE: val = (u64)page_counter_read(counter); seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; case RES_RSVD_LIMIT: counter = &h_cg->rsvd_hugepage[idx]; fallthrough; case RES_LIMIT: val = (u64)counter->max; if (val == limit) seq_puts(seq, "max\n"); else seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; default: BUG(); } return 0; } static DEFINE_MUTEX(hugetlb_limit_mutex); static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, const char *max) { int ret, idx; unsigned long nr_pages; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); bool rsvd = false; if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ return -EINVAL; buf = strstrip(buf); ret = page_counter_memparse(buf, max, &nr_pages); if (ret) return ret; idx = MEMFILE_IDX(of_cft(of)->private); nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_RSVD_LIMIT: rsvd = true; fallthrough; case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); ret = page_counter_set_max( __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); mutex_unlock(&hugetlb_limit_mutex); break; default: ret = -EINVAL; break; } return ret ?: nbytes; } static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); } static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); } static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int ret = 0; struct page_counter *counter, *rsvd_counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)]; switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_MAX_USAGE: page_counter_reset_watermark(counter); break; case RES_RSVD_MAX_USAGE: page_counter_reset_watermark(rsvd_counter); break; case RES_FAILCNT: counter->failcnt = 0; break; case RES_RSVD_FAILCNT: rsvd_counter->failcnt = 0; break; default: ret = -EINVAL; break; } return ret ?: nbytes; } static char *mem_fmt(char *buf, int size, unsigned long hsize) { if (hsize >= SZ_1G) snprintf(buf, size, "%luGB", hsize / SZ_1G); else if (hsize >= SZ_1M) snprintf(buf, size, "%luMB", hsize / SZ_1M); else snprintf(buf, size, "%luKB", hsize / SZ_1K); return buf; } static int __hugetlb_events_show(struct seq_file *seq, bool local) { int idx; long max; struct cftype *cft = seq_cft(seq); struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); idx = MEMFILE_IDX(cft->private); if (local) max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]); else max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]); seq_printf(seq, "max %lu\n", max); return 0; } static int hugetlb_events_show(struct seq_file *seq, void *v) { return __hugetlb_events_show(seq, false); } static int hugetlb_events_local_show(struct seq_file *seq, void *v) { return __hugetlb_events_show(seq, true); } static struct cftype hugetlb_dfl_tmpl[] = { { .name = "max", .private = RES_LIMIT, .seq_show = hugetlb_cgroup_read_u64_max, .write = hugetlb_cgroup_write_dfl, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "rsvd.max", .private = RES_RSVD_LIMIT, .seq_show = hugetlb_cgroup_read_u64_max, .write = hugetlb_cgroup_write_dfl, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "current", .private = RES_USAGE, .seq_show = hugetlb_cgroup_read_u64_max, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "rsvd.current", .private = RES_RSVD_USAGE, .seq_show = hugetlb_cgroup_read_u64_max, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "events", .seq_show = hugetlb_events_show, .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]), .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "events.local", .seq_show = hugetlb_events_local_show, .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]), .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "numa_stat", .seq_show = hugetlb_cgroup_read_numa_stat, .flags = CFTYPE_NOT_ON_ROOT, }, /* don't need terminator here */ }; static struct cftype hugetlb_legacy_tmpl[] = { { .name = "limit_in_bytes", .private = RES_LIMIT, .read_u64 = hugetlb_cgroup_read_u64, .write = hugetlb_cgroup_write_legacy, }, { .name = "rsvd.limit_in_bytes", .private = RES_RSVD_LIMIT, .read_u64 = hugetlb_cgroup_read_u64, .write = hugetlb_cgroup_write_legacy, }, { .name = "usage_in_bytes", .private = RES_USAGE, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "rsvd.usage_in_bytes", .private = RES_RSVD_USAGE, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "max_usage_in_bytes", .private = RES_MAX_USAGE, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "rsvd.max_usage_in_bytes", .private = RES_RSVD_MAX_USAGE, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "failcnt", .private = RES_FAILCNT, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "rsvd.failcnt", .private = RES_RSVD_FAILCNT, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "numa_stat", .seq_show = hugetlb_cgroup_read_numa_stat, }, /* don't need terminator here */ }; static void __init hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft, struct cftype *tmpl, int tmpl_size) { char buf[32]; int i, idx = hstate_index(h); /* format the size */ mem_fmt(buf, sizeof(buf), huge_page_size(h)); for (i = 0; i < tmpl_size; cft++, tmpl++, i++) { *cft = *tmpl; /* rebuild the name */ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name); /* rebuild the private */ cft->private = MEMFILE_PRIVATE(idx, tmpl->private); /* rebuild the file_offset */ if (tmpl->file_offset) { unsigned int offset = tmpl->file_offset; cft->file_offset = MEMFILE_OFFSET0(offset) + MEMFILE_FIELD_SIZE(offset) * idx; } lockdep_register_key(&cft->lockdep_key); } } static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h) { int idx = hstate_index(h); hugetlb_cgroup_cfttypes_init(h, dfl_files + idx * DFL_TMPL_SIZE, hugetlb_dfl_tmpl, DFL_TMPL_SIZE); } static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h) { int idx = hstate_index(h); hugetlb_cgroup_cfttypes_init(h, legacy_files + idx * LEGACY_TMPL_SIZE, hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE); } static void __init __hugetlb_cgroup_file_init(struct hstate *h) { __hugetlb_cgroup_file_dfl_init(h); __hugetlb_cgroup_file_legacy_init(h); } static void __init __hugetlb_cgroup_file_pre_init(void) { int cft_count; cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */ dfl_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL); BUG_ON(!dfl_files); cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */ legacy_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL); BUG_ON(!legacy_files); } static void __init __hugetlb_cgroup_file_post_init(void) { WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, dfl_files)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, legacy_files)); } void __init hugetlb_cgroup_file_init(void) { struct hstate *h; __hugetlb_cgroup_file_pre_init(); for_each_hstate(h) __hugetlb_cgroup_file_init(h); __hugetlb_cgroup_file_post_init(); } /* * hugetlb_lock will make sure a parallel cgroup rmdir won't happen * when we migrate hugepages */ void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) { struct hugetlb_cgroup *h_cg; struct hugetlb_cgroup *h_cg_rsvd; struct hstate *h = folio_hstate(old_folio); if (hugetlb_cgroup_disabled()) return; spin_lock_irq(&hugetlb_lock); h_cg = hugetlb_cgroup_from_folio(old_folio); h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); set_hugetlb_cgroup(old_folio, NULL); set_hugetlb_cgroup_rsvd(old_folio, NULL); /* move the h_cg details to new cgroup */ set_hugetlb_cgroup(new_folio, h_cg); set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); list_move(&new_folio->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); } static struct cftype hugetlb_files[] = { {} /* terminate */ }; struct cgroup_subsys hugetlb_cgrp_subsys = { .css_alloc = hugetlb_cgroup_css_alloc, .css_offline = hugetlb_cgroup_css_offline, .css_free = hugetlb_cgroup_css_free, .dfl_cftypes = hugetlb_files, .legacy_cftypes = hugetlb_files, };
191 191 190 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 // SPDX-License-Identifier: GPL-2.0-only #include <linux/virtio.h> #include <linux/spinlock.h> #include <linux/virtio_config.h> #include <linux/virtio_anchor.h> #include <linux/module.h> #include <linux/idr.h> #include <linux/of.h> #include <uapi/linux/virtio_ids.h> /* Unique numbering for virtio devices. */ static DEFINE_IDA(virtio_index_ida); static ssize_t device_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); return sysfs_emit(buf, "0x%04x\n", dev->id.device); } static DEVICE_ATTR_RO(device); static ssize_t vendor_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); return sysfs_emit(buf, "0x%04x\n", dev->id.vendor); } static DEVICE_ATTR_RO(vendor); static ssize_t status_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); return sysfs_emit(buf, "0x%08x\n", dev->config->get_status(dev)); } static DEVICE_ATTR_RO(status); static ssize_t modalias_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); return sysfs_emit(buf, "virtio:d%08Xv%08X\n", dev->id.device, dev->id.vendor); } static DEVICE_ATTR_RO(modalias); static ssize_t features_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); unsigned int i; ssize_t len = 0; /* We actually represent this as a bitstring, as it could be * arbitrary length in future. */ for (i = 0; i < VIRTIO_FEATURES_MAX; i++) len += sysfs_emit_at(buf, len, "%c", __virtio_test_bit(dev, i) ? '1' : '0'); len += sysfs_emit_at(buf, len, "\n"); return len; } static DEVICE_ATTR_RO(features); static struct attribute *virtio_dev_attrs[] = { &dev_attr_device.attr, &dev_attr_vendor.attr, &dev_attr_status.attr, &dev_attr_modalias.attr, &dev_attr_features.attr, NULL, }; ATTRIBUTE_GROUPS(virtio_dev); static inline int virtio_id_match(const struct virtio_device *dev, const struct virtio_device_id *id) { if (id->device != dev->id.device && id->device != VIRTIO_DEV_ANY_ID) return 0; return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor == dev->id.vendor; } /* This looks through all the IDs a driver claims to support. If any of them * match, we return 1 and the kernel will call virtio_dev_probe(). */ static int virtio_dev_match(struct device *_dv, const struct device_driver *_dr) { unsigned int i; struct virtio_device *dev = dev_to_virtio(_dv); const struct virtio_device_id *ids; ids = drv_to_virtio(_dr)->id_table; for (i = 0; ids[i].device; i++) if (virtio_id_match(dev, &ids[i])) return 1; return 0; } static int virtio_uevent(const struct device *_dv, struct kobj_uevent_env *env) { const struct virtio_device *dev = dev_to_virtio(_dv); return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X", dev->id.device, dev->id.vendor); } void virtio_check_driver_offered_feature(const struct virtio_device *vdev, unsigned int fbit) { unsigned int i; struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver); for (i = 0; i < drv->feature_table_size; i++) if (drv->feature_table[i] == fbit) return; if (drv->feature_table_legacy) { for (i = 0; i < drv->feature_table_size_legacy; i++) if (drv->feature_table_legacy[i] == fbit) return; } BUG(); } EXPORT_SYMBOL_GPL(virtio_check_driver_offered_feature); static void __virtio_config_changed(struct virtio_device *dev) { struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); if (!dev->config_core_enabled || dev->config_driver_disabled) dev->config_change_pending = true; else if (drv && drv->config_changed) { drv->config_changed(dev); dev->config_change_pending = false; } } void virtio_config_changed(struct virtio_device *dev) { unsigned long flags; spin_lock_irqsave(&dev->config_lock, flags); __virtio_config_changed(dev); spin_unlock_irqrestore(&dev->config_lock, flags); } EXPORT_SYMBOL_GPL(virtio_config_changed); /** * virtio_config_driver_disable - disable config change reporting by drivers * @dev: the device to disable * * This is only allowed to be called by a driver and disabling can't * be nested. */ void virtio_config_driver_disable(struct virtio_device *dev) { spin_lock_irq(&dev->config_lock); dev->config_driver_disabled = true; spin_unlock_irq(&dev->config_lock); } EXPORT_SYMBOL_GPL(virtio_config_driver_disable); /** * virtio_config_driver_enable - enable config change reporting by drivers * @dev: the device to enable * * This is only allowed to be called by a driver and enabling can't * be nested. */ void virtio_config_driver_enable(struct virtio_device *dev) { spin_lock_irq(&dev->config_lock); dev->config_driver_disabled = false; if (dev->config_change_pending) __virtio_config_changed(dev); spin_unlock_irq(&dev->config_lock); } EXPORT_SYMBOL_GPL(virtio_config_driver_enable); static void virtio_config_core_disable(struct virtio_device *dev) { spin_lock_irq(&dev->config_lock); dev->config_core_enabled = false; spin_unlock_irq(&dev->config_lock); } static void virtio_config_core_enable(struct virtio_device *dev) { spin_lock_irq(&dev->config_lock); dev->config_core_enabled = true; if (dev->config_change_pending) __virtio_config_changed(dev); spin_unlock_irq(&dev->config_lock); } void virtio_add_status(struct virtio_device *dev, unsigned int status) { might_sleep(); dev->config->set_status(dev, dev->config->get_status(dev) | status); } EXPORT_SYMBOL_GPL(virtio_add_status); /* Do some validation, then set FEATURES_OK */ static int virtio_features_ok(struct virtio_device *dev) { unsigned int status; might_sleep(); if (virtio_check_mem_acc_cb(dev)) { if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) { dev_warn(&dev->dev, "device must provide VIRTIO_F_VERSION_1\n"); return -ENODEV; } if (!virtio_has_feature(dev, VIRTIO_F_ACCESS_PLATFORM)) { dev_warn(&dev->dev, "device must provide VIRTIO_F_ACCESS_PLATFORM\n"); return -ENODEV; } } if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) return 0; virtio_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); status = dev->config->get_status(dev); if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { dev_err(&dev->dev, "virtio: device refuses features: %x\n", status); return -ENODEV; } return 0; } /** * virtio_reset_device - quiesce device for removal * @dev: the device to reset * * Prevents device from sending interrupts and accessing memory. * * Generally used for cleanup during driver / device removal. * * Once this has been invoked, caller must ensure that * virtqueue_notify / virtqueue_kick are not in progress. * * Note: this guarantees that vq callbacks are not in progress, however caller * is responsible for preventing access from other contexts, such as a system * call/workqueue/bh. Invoking virtio_break_device then flushing any such * contexts is one way to handle that. * */ void virtio_reset_device(struct virtio_device *dev) { #ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION /* * The below virtio_synchronize_cbs() guarantees that any * interrupt for this line arriving after * virtio_synchronize_vqs() has completed is guaranteed to see * vq->broken as true. */ virtio_break_device(dev); virtio_synchronize_cbs(dev); #endif dev->config->reset(dev); } EXPORT_SYMBOL_GPL(virtio_reset_device); static int virtio_dev_probe(struct device *_d) { int err, i; struct virtio_device *dev = dev_to_virtio(_d); struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); u64 device_features[VIRTIO_FEATURES_DWORDS]; u64 driver_features[VIRTIO_FEATURES_DWORDS]; u64 driver_features_legacy; /* We have a driver! */ virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); /* Figure out what features the device supports. */ virtio_get_features(dev, device_features); /* Figure out what features the driver supports. */ virtio_features_zero(driver_features); for (i = 0; i < drv->feature_table_size; i++) { unsigned int f = drv->feature_table[i]; if (!WARN_ON_ONCE(f >= VIRTIO_FEATURES_MAX)) virtio_features_set_bit(driver_features, f); } /* Some drivers have a separate feature table for virtio v1.0 */ if (drv->feature_table_legacy) { driver_features_legacy = 0; for (i = 0; i < drv->feature_table_size_legacy; i++) { unsigned int f = drv->feature_table_legacy[i]; if (!WARN_ON_ONCE(f >= 64)) driver_features_legacy |= (1ULL << f); } } else { driver_features_legacy = driver_features[0]; } if (virtio_features_test_bit(device_features, VIRTIO_F_VERSION_1)) { for (i = 0; i < VIRTIO_FEATURES_DWORDS; ++i) dev->features_array[i] = driver_features[i] & device_features[i]; } else { virtio_features_from_u64(dev->features_array, driver_features_legacy & device_features[0]); } /* When debugging, user may filter some features by hand. */ virtio_debug_device_filter_features(dev); /* Transport features always preserved to pass to finalize_features. */ for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) if (virtio_features_test_bit(device_features, i)) __virtio_set_bit(dev, i); err = dev->config->finalize_features(dev); if (err) goto err; if (drv->validate) { u64 features[VIRTIO_FEATURES_DWORDS]; virtio_features_copy(features, dev->features_array); err = drv->validate(dev); if (err) goto err; /* Did validation change any features? Then write them again. */ if (!virtio_features_equal(features, dev->features_array)) { err = dev->config->finalize_features(dev); if (err) goto err; } } err = virtio_features_ok(dev); if (err) goto err; err = drv->probe(dev); if (err) goto err; /* If probe didn't do it, mark device DRIVER_OK ourselves. */ if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK)) virtio_device_ready(dev); if (drv->scan) drv->scan(dev); virtio_config_core_enable(dev); return 0; err: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return err; } static void virtio_dev_remove(struct device *_d) { struct virtio_device *dev = dev_to_virtio(_d); struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); virtio_config_core_disable(dev); drv->remove(dev); /* Driver should have reset device. */ WARN_ON_ONCE(dev->config->get_status(dev)); /* Acknowledge the device's existence again. */ virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); of_node_put(dev->dev.of_node); } /* * virtio_irq_get_affinity - get IRQ affinity mask for device * @_d: ptr to dev structure * @irq_vec: interrupt vector number * * Return the CPU affinity mask for @_d and @irq_vec. */ static const struct cpumask *virtio_irq_get_affinity(struct device *_d, unsigned int irq_vec) { struct virtio_device *dev = dev_to_virtio(_d); if (!dev->config->get_vq_affinity) return NULL; return dev->config->get_vq_affinity(dev, irq_vec); } static void virtio_dev_shutdown(struct device *_d) { struct virtio_device *dev = dev_to_virtio(_d); struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); /* * Stop accesses to or from the device. * We only need to do it if there's a driver - no accesses otherwise. */ if (!drv) return; /* If the driver has its own shutdown method, use that. */ if (drv->shutdown) { drv->shutdown(dev); return; } /* * Some devices get wedged if you kick them after they are * reset. Mark all vqs as broken to make sure we don't. */ virtio_break_device(dev); /* * Guarantee that any callback will see vq->broken as true. */ virtio_synchronize_cbs(dev); /* * As IOMMUs are reset on shutdown, this will block device access to memory. * Some devices get wedged if this happens, so reset to make sure it does not. */ dev->config->reset(dev); } static const struct bus_type virtio_bus = { .name = "virtio", .match = virtio_dev_match, .dev_groups = virtio_dev_groups, .uevent = virtio_uevent, .probe = virtio_dev_probe, .remove = virtio_dev_remove, .irq_get_affinity = virtio_irq_get_affinity, .shutdown = virtio_dev_shutdown, }; int __register_virtio_driver(struct virtio_driver *driver, struct module *owner) { /* Catch this early. */ BUG_ON(driver->feature_table_size && !driver->feature_table); driver->driver.bus = &virtio_bus; driver->driver.owner = owner; return driver_register(&driver->driver); } EXPORT_SYMBOL_GPL(__register_virtio_driver); void unregister_virtio_driver(struct virtio_driver *driver) { driver_unregister(&driver->driver); } EXPORT_SYMBOL_GPL(unregister_virtio_driver); static int virtio_device_of_init(struct virtio_device *dev) { struct device_node *np, *pnode = dev_of_node(dev->dev.parent); char compat[] = "virtio,deviceXXXXXXXX"; int ret, count; if (!pnode) return 0; count = of_get_available_child_count(pnode); if (!count) return 0; /* There can be only 1 child node */ if (WARN_ON(count > 1)) return -EINVAL; np = of_get_next_available_child(pnode, NULL); if (WARN_ON(!np)) return -ENODEV; ret = snprintf(compat, sizeof(compat), "virtio,device%x", dev->id.device); BUG_ON(ret >= sizeof(compat)); /* * On powerpc/pseries virtio devices are PCI devices so PCI * vendor/device ids play the role of the "compatible" property. * Simply don't init of_node in this case. */ if (!of_device_is_compatible(np, compat)) { ret = 0; goto out; } dev->dev.of_node = np; return 0; out: of_node_put(np); return ret; } /** * register_virtio_device - register virtio device * @dev : virtio device to be registered * * On error, the caller must call put_device on &@dev->dev (and not kfree), * as another code path may have obtained a reference to @dev. * * Returns: 0 on success, -error on failure */ int register_virtio_device(struct virtio_device *dev) { int err; dev->dev.bus = &virtio_bus; device_initialize(&dev->dev); /* Assign a unique device index and hence name. */ err = ida_alloc(&virtio_index_ida, GFP_KERNEL); if (err < 0) goto out; dev->index = err; err = dev_set_name(&dev->dev, "virtio%u", dev->index); if (err) goto out_ida_remove; err = virtio_device_of_init(dev); if (err) goto out_ida_remove; spin_lock_init(&dev->config_lock); dev->config_driver_disabled = false; dev->config_core_enabled = false; dev->config_change_pending = false; INIT_LIST_HEAD(&dev->vqs); spin_lock_init(&dev->vqs_list_lock); /* We always start by resetting the device, in case a previous * driver messed it up. This also tests that code path a little. */ virtio_reset_device(dev); /* Acknowledge that we've seen the device. */ virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); virtio_debug_device_init(dev); /* * device_add() causes the bus infrastructure to look for a matching * driver. */ err = device_add(&dev->dev); if (err) goto out_of_node_put; return 0; out_of_node_put: of_node_put(dev->dev.of_node); out_ida_remove: ida_free(&virtio_index_ida, dev->index); out: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return err; } EXPORT_SYMBOL_GPL(register_virtio_device); bool is_virtio_device(struct device *dev) { return dev->bus == &virtio_bus; } EXPORT_SYMBOL_GPL(is_virtio_device); void unregister_virtio_device(struct virtio_device *dev) { int index = dev->index; /* save for after device release */ device_unregister(&dev->dev); virtio_debug_device_exit(dev); ida_free(&virtio_index_ida, index); } EXPORT_SYMBOL_GPL(unregister_virtio_device); static int virtio_device_restore_priv(struct virtio_device *dev, bool restore) { struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); int ret; /* We always start by resetting the device, in case a previous * driver messed it up. */ virtio_reset_device(dev); /* Acknowledge that we've seen the device. */ virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); /* Maybe driver failed before freeze. * Restore the failed status, for debugging. */ if (dev->failed) virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); if (!drv) return 0; /* We have a driver! */ virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); ret = dev->config->finalize_features(dev); if (ret) goto err; ret = virtio_features_ok(dev); if (ret) goto err; if (restore) { if (drv->restore) { ret = drv->restore(dev); if (ret) goto err; } } else { ret = drv->reset_done(dev); if (ret) goto err; } /* If restore didn't do it, mark device DRIVER_OK ourselves. */ if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK)) virtio_device_ready(dev); virtio_config_core_enable(dev); return 0; err: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return ret; } #ifdef CONFIG_PM_SLEEP int virtio_device_freeze(struct virtio_device *dev) { struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); int ret; virtio_config_core_disable(dev); dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED; if (drv && drv->freeze) { ret = drv->freeze(dev); if (ret) { virtio_config_core_enable(dev); return ret; } } return 0; } EXPORT_SYMBOL_GPL(virtio_device_freeze); int virtio_device_restore(struct virtio_device *dev) { return virtio_device_restore_priv(dev, true); } EXPORT_SYMBOL_GPL(virtio_device_restore); #endif int virtio_device_reset_prepare(struct virtio_device *dev) { struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); int ret; if (!drv || !drv->reset_prepare) return -EOPNOTSUPP; virtio_config_core_disable(dev); dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED; ret = drv->reset_prepare(dev); if (ret) { virtio_config_core_enable(dev); return ret; } return 0; } EXPORT_SYMBOL_GPL(virtio_device_reset_prepare); int virtio_device_reset_done(struct virtio_device *dev) { struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); if (!drv || !drv->reset_done) return -EOPNOTSUPP; return virtio_device_restore_priv(dev, false); } EXPORT_SYMBOL_GPL(virtio_device_reset_done); static int virtio_init(void) { BUILD_BUG_ON(offsetof(struct virtio_device, features) != offsetof(struct virtio_device, features_array[0])); if (bus_register(&virtio_bus) != 0) panic("virtio bus registration failed"); virtio_debug_init(); return 0; } static void __exit virtio_exit(void) { virtio_debug_exit(); bus_unregister(&virtio_bus); ida_destroy(&virtio_index_ida); } core_initcall(virtio_init); module_exit(virtio_exit); MODULE_DESCRIPTION("Virtio core interface"); MODULE_LICENSE("GPL");
1 15 3 13 14 2 12 13 13 1 1 18 3 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 3 16 7 4 4 3 4 4 3 3 3 3 3 3 3 3 11 15 11 11 9 9 9 5 9 677 672 6 11 11 11 9 9 9 8 1 1 6 11 2 5 4 167 1 3 2 3 155 3 6 5 11 12 2 12 3 9 12 2 2 2 2 2 103 102 5 127 127 6 8 2 6 2 4 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 // SPDX-License-Identifier: GPL-2.0-only /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork() * * Copyright (C) 2008-2009 Red Hat, Inc. * Authors: * Izik Eidus * Andrea Arcangeli * Chris Wright * Hugh Dickins */ #include <linux/errno.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/cputime.h> #include <linux/rwsem.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/spinlock.h> #include <linux/xxhash.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/memory.h> #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/ksm.h> #include <linux/hashtable.h> #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/pagewalk.h> #include <asm/tlbflush.h> #include "internal.h" #include "mm_slot.h" #define CREATE_TRACE_POINTS #include <trace/events/ksm.h> #ifdef CONFIG_NUMA #define NUMA(x) (x) #define DO_NUMA(x) do { (x); } while (0) #else #define NUMA(x) (0) #define DO_NUMA(x) do { } while (0) #endif typedef u8 rmap_age_t; /** * DOC: Overview * * A few notes about the KSM scanning process, * to make it easier to understand the data structures below: * * In order to reduce excessive scanning, KSM sorts the memory pages by their * contents into a data structure that holds pointers to the pages' locations. * * Since the contents of the pages may change at any moment, KSM cannot just * insert the pages into a normal sorted tree and expect it to find anything. * Therefore KSM uses two data structures - the stable and the unstable tree. * * The stable tree holds pointers to all the merged pages (ksm pages), sorted * by their contents. Because each such page is write-protected, searching on * this tree is fully assured to be working (except when pages are unmapped), * and therefore this tree is called the stable tree. * * The stable tree node includes information required for reverse * mapping from a KSM page to virtual addresses that map this page. * * In order to avoid large latencies of the rmap walks on KSM pages, * KSM maintains two types of nodes in the stable tree: * * * the regular nodes that keep the reverse mapping structures in a * linked list * * the "chains" that link nodes ("dups") that represent the same * write protected memory content, but each "dup" corresponds to a * different KSM page copy of that content * * Internally, the regular nodes, "dups" and "chains" are represented * using the same struct ksm_stable_node structure. * * In addition to the stable tree, KSM uses a second data structure called the * unstable tree: this tree holds pointers to pages which have been found to * be "unchanged for a period of time". The unstable tree sorts these pages * by their contents, but since they are not write-protected, KSM cannot rely * upon the unstable tree to work correctly - the unstable tree is liable to * be corrupted as its contents are modified, and so it is called unstable. * * KSM solves this problem by several techniques: * * 1) The unstable tree is flushed every time KSM completes scanning all * memory areas, and then the tree is rebuilt again from the beginning. * 2) KSM will only insert into the unstable tree, pages whose hash value * has not changed since the previous scan of all memory areas. * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the * colors of the nodes and not on their contents, assuring that even when * the tree gets "corrupted" it won't get out of balance, so scanning time * remains the same (also, searching and inserting nodes in an rbtree uses * the same algorithm, so we have no overhead when we flush and rebuild). * 4) KSM never flushes the stable tree, which means that even if it were to * take 10 attempts to find a page in the unstable tree, once it is found, * it is secured in the stable tree. (When we scan a new page, we first * compare it against the stable tree, and then against the unstable tree.) * * If the merge_across_nodes tunable is unset, then KSM maintains multiple * stable trees and multiple unstable trees: one of each for each NUMA node. */ /** * struct ksm_mm_slot - ksm information per mm that is being scanned * @slot: hash lookup from mm to mm_slot * @rmap_list: head for this mm_slot's singly-linked list of rmap_items */ struct ksm_mm_slot { struct mm_slot slot; struct ksm_rmap_item *rmap_list; }; /** * struct ksm_scan - cursor for scanning * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned * @rmap_list: link to the next rmap to be scanned in the rmap_list * @seqnr: count of completed full scans (needed when removing unstable node) * * There is only the one ksm_scan instance of this cursor structure. */ struct ksm_scan { struct ksm_mm_slot *mm_slot; unsigned long address; struct ksm_rmap_item **rmap_list; unsigned long seqnr; }; /** * struct ksm_stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list * @hlist_dup: linked into the stable_node->hlist with a stable_node chain * @list: linked into migrate_nodes, pending placement in the proper node tree * @hlist: hlist head of rmap_items using this ksm page * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) * @chain_prune_time: time of the last full garbage collection * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct ksm_stable_node { union { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ struct list_head *head; struct { struct hlist_node hlist_dup; struct list_head list; }; }; }; struct hlist_head hlist; union { unsigned long kpfn; unsigned long chain_prune_time; }; /* * STABLE_NODE_CHAIN can be any negative number in * rmap_hlist_len negative range, but better not -1 to be able * to reliably detect underflows. */ #define STABLE_NODE_CHAIN -1024 int rmap_hlist_len; #ifdef CONFIG_NUMA int nid; #endif }; /** * struct ksm_rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @nid: NUMA node id of unstable tree in which linked (may not match page) * @mm: the memory structure this rmap_item is pointing into * @address: the virtual address this rmap_item tracks (+ flags in low bits) * @oldchecksum: previous checksum of the page at that virtual address * @node: rb node of this rmap_item in the unstable tree * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node * @age: number of scan iterations since creation * @remaining_skips: how many scans to skip */ struct ksm_rmap_item { struct ksm_rmap_item *rmap_list; union { struct anon_vma *anon_vma; /* when stable */ #ifdef CONFIG_NUMA int nid; /* when node of unstable tree */ #endif }; struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ unsigned int oldchecksum; /* when unstable */ rmap_age_t age; rmap_age_t remaining_skips; union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ struct ksm_stable_node *head; struct hlist_node hlist; }; }; }; #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; static struct rb_root one_unstable_tree[1] = { RB_ROOT }; static struct rb_root *root_stable_tree = one_stable_tree; static struct rb_root *root_unstable_tree = one_unstable_tree; /* Recently migrated nodes of stable tree, pending proper placement */ static LIST_HEAD(migrate_nodes); #define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev) #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct ksm_mm_slot ksm_mm_head = { .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node), }; static struct ksm_scan ksm_scan = { .mm_slot = &ksm_mm_head, }; static struct kmem_cache *rmap_item_cache; static struct kmem_cache *stable_node_cache; static struct kmem_cache *mm_slot_cache; /* Default number of pages to scan per batch */ #define DEFAULT_PAGES_TO_SCAN 100 /* The number of pages scanned */ static unsigned long ksm_pages_scanned; /* The number of nodes in the stable tree */ static unsigned long ksm_pages_shared; /* The number of page slots additionally sharing those nodes */ static unsigned long ksm_pages_sharing; /* The number of nodes in the unstable tree */ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; /* The number of stable_node chains */ static unsigned long ksm_stable_node_chains; /* The number of stable_node dups linked to the stable_node chains */ static unsigned long ksm_stable_node_dups; /* Delay in pruning stale stable_node_dups in the stable_node_chains */ static unsigned int ksm_stable_node_chains_prune_millisecs = 2000; /* Maximum number of page slots sharing a stable node */ static int ksm_max_page_sharing = 256; /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; /* Checksum of an empty (zeroed) page */ static unsigned int zero_checksum __read_mostly; /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; /* Skip pages that couldn't be de-duplicated previously */ /* Default to true at least temporarily, for testing */ static bool ksm_smart_scan = true; /* The number of zero pages which is placed by KSM */ atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0); /* The number of pages that have been skipped due to "smart scanning" */ static unsigned long ksm_pages_skipped; /* Don't scan more than max pages per batch. */ static unsigned long ksm_advisor_max_pages_to_scan = 30000; /* Min CPU for scanning pages per scan */ #define KSM_ADVISOR_MIN_CPU 10 /* Max CPU for scanning pages per scan */ static unsigned int ksm_advisor_max_cpu = 70; /* Target scan time in seconds to analyze all KSM candidate pages. */ static unsigned long ksm_advisor_target_scan_time = 200; /* Exponentially weighted moving average. */ #define EWMA_WEIGHT 30 /** * struct advisor_ctx - metadata for KSM advisor * @start_scan: start time of the current scan * @scan_time: scan time of previous scan * @change: change in percent to pages_to_scan parameter * @cpu_time: cpu time consumed by the ksmd thread in the previous scan */ struct advisor_ctx { ktime_t start_scan; unsigned long scan_time; unsigned long change; unsigned long long cpu_time; }; static struct advisor_ctx advisor_ctx; /* Define different advisor's */ enum ksm_advisor_type { KSM_ADVISOR_NONE, KSM_ADVISOR_SCAN_TIME, }; static enum ksm_advisor_type ksm_advisor; #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: */ /* At least scan this many pages per batch. */ static unsigned long ksm_advisor_min_pages_to_scan = 500; static void set_advisor_defaults(void) { if (ksm_advisor == KSM_ADVISOR_NONE) { ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN; } else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) { advisor_ctx = (const struct advisor_ctx){ 0 }; ksm_thread_pages_to_scan = ksm_advisor_min_pages_to_scan; } } #endif /* CONFIG_SYSFS */ static inline void advisor_start_scan(void) { if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) advisor_ctx.start_scan = ktime_get(); } /* * Use previous scan time if available, otherwise use current scan time as an * approximation for the previous scan time. */ static inline unsigned long prev_scan_time(struct advisor_ctx *ctx, unsigned long scan_time) { return ctx->scan_time ? ctx->scan_time : scan_time; } /* Calculate exponential weighted moving average */ static unsigned long ewma(unsigned long prev, unsigned long curr) { return ((100 - EWMA_WEIGHT) * prev + EWMA_WEIGHT * curr) / 100; } /* * The scan time advisor is based on the current scan rate and the target * scan rate. * * new_pages_to_scan = pages_to_scan * (scan_time / target_scan_time) * * To avoid perturbations it calculates a change factor of previous changes. * A new change factor is calculated for each iteration and it uses an * exponentially weighted moving average. The new pages_to_scan value is * multiplied with that change factor: * * new_pages_to_scan *= change facor * * The new_pages_to_scan value is limited by the cpu min and max values. It * calculates the cpu percent for the last scan and calculates the new * estimated cpu percent cost for the next scan. That value is capped by the * cpu min and max setting. * * In addition the new pages_to_scan value is capped by the max and min * limits. */ static void scan_time_advisor(void) { unsigned int cpu_percent; unsigned long cpu_time; unsigned long cpu_time_diff; unsigned long cpu_time_diff_ms; unsigned long pages; unsigned long per_page_cost; unsigned long factor; unsigned long change; unsigned long last_scan_time; unsigned long scan_time; /* Convert scan time to seconds */ scan_time = div_s64(ktime_ms_delta(ktime_get(), advisor_ctx.start_scan), MSEC_PER_SEC); scan_time = scan_time ? scan_time : 1; /* Calculate CPU consumption of ksmd background thread */ cpu_time = task_sched_runtime(current); cpu_time_diff = cpu_time - advisor_ctx.cpu_time; cpu_time_diff_ms = cpu_time_diff / 1000 / 1000; cpu_percent = (cpu_time_diff_ms * 100) / (scan_time * 1000); cpu_percent = cpu_percent ? cpu_percent : 1; last_scan_time = prev_scan_time(&advisor_ctx, scan_time); /* Calculate scan time as percentage of target scan time */ factor = ksm_advisor_target_scan_time * 100 / scan_time; factor = factor ? factor : 1; /* * Calculate scan time as percentage of last scan time and use * exponentially weighted average to smooth it */ change = scan_time * 100 / last_scan_time; change = change ? change : 1; change = ewma(advisor_ctx.change, change); /* Calculate new scan rate based on target scan rate. */ pages = ksm_thread_pages_to_scan * 100 / factor; /* Update pages_to_scan by weighted change percentage. */ pages = pages * change / 100; /* Cap new pages_to_scan value */ per_page_cost = ksm_thread_pages_to_scan / cpu_percent; per_page_cost = per_page_cost ? per_page_cost : 1; pages = min(pages, per_page_cost * ksm_advisor_max_cpu); pages = max(pages, per_page_cost * KSM_ADVISOR_MIN_CPU); pages = min(pages, ksm_advisor_max_pages_to_scan); /* Update advisor context */ advisor_ctx.change = change; advisor_ctx.scan_time = scan_time; advisor_ctx.cpu_time = cpu_time; ksm_thread_pages_to_scan = pages; trace_ksm_advisor(scan_time, pages, cpu_percent); } static void advisor_stop_scan(void) { if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) scan_time_advisor(); } #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; static int ksm_nr_node_ids = 1; #else #define ksm_merge_across_nodes 1U #define ksm_nr_node_ids 1 #endif #define KSM_RUN_STOP 0 #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 #define KSM_RUN_OFFLINE 4 static unsigned long ksm_run = KSM_RUN_STOP; static void wait_while_offlining(void); static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); static int __init ksm_slab_init(void) { rmap_item_cache = KMEM_CACHE(ksm_rmap_item, 0); if (!rmap_item_cache) goto out; stable_node_cache = KMEM_CACHE(ksm_stable_node, 0); if (!stable_node_cache) goto out_free1; mm_slot_cache = KMEM_CACHE(ksm_mm_slot, 0); if (!mm_slot_cache) goto out_free2; return 0; out_free2: kmem_cache_destroy(stable_node_cache); out_free1: kmem_cache_destroy(rmap_item_cache); out: return -ENOMEM; } static void __init ksm_slab_free(void) { kmem_cache_destroy(mm_slot_cache); kmem_cache_destroy(stable_node_cache); kmem_cache_destroy(rmap_item_cache); mm_slot_cache = NULL; } static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain) { return chain->rmap_hlist_len == STABLE_NODE_CHAIN; } static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup) { return dup->head == STABLE_NODE_DUP_HEAD; } static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup, struct ksm_stable_node *chain) { VM_BUG_ON(is_stable_node_dup(dup)); dup->head = STABLE_NODE_DUP_HEAD; VM_BUG_ON(!is_stable_node_chain(chain)); hlist_add_head(&dup->hlist_dup, &chain->hlist); ksm_stable_node_dups++; } static inline void __stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(!is_stable_node_dup(dup)); hlist_del(&dup->hlist_dup); ksm_stable_node_dups--; } static inline void stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(is_stable_node_chain(dup)); if (is_stable_node_dup(dup)) __stable_node_dup_del(dup); else rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid)); #ifdef CONFIG_DEBUG_VM dup->head = NULL; #endif } static inline struct ksm_rmap_item *alloc_rmap_item(void) { struct ksm_rmap_item *rmap_item; rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (rmap_item) ksm_rmap_items++; return rmap_item; } static inline void free_rmap_item(struct ksm_rmap_item *rmap_item) { ksm_rmap_items--; rmap_item->mm->ksm_rmap_items--; rmap_item->mm = NULL; /* debug safety */ kmem_cache_free(rmap_item_cache, rmap_item); } static inline struct ksm_stable_node *alloc_stable_node(void) { /* * The allocation can take too long with GFP_KERNEL when memory is under * pressure, which may lead to hung task warnings. Adding __GFP_HIGH * grants access to memory reserves, helping to avoid this problem. */ return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); } static inline void free_stable_node(struct ksm_stable_node *stable_node) { VM_BUG_ON(stable_node->rmap_hlist_len && !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, * takes mmap_lock briefly to serialize against them. ksm_exit() does not set * a special flag: they can just back out as soon as mm_users goes to zero. * ksm_test_exit() is used throughout to make this test for exit: in some * places for correctness, in some places just to avoid unnecessary work. */ static inline bool ksm_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } /* * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. * * We take great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem, where we would not want to touch it. * * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma) { vm_fault_t ret = 0; if (lock_vma) vma_start_write(vma); do { bool ksm_page = false; struct folio_walk fw; struct folio *folio; cond_resched(); folio = folio_walk_start(&fw, vma, addr, FW_MIGRATION | FW_ZEROPAGE); if (folio) { /* Small folio implies FW_LEVEL_PTE. */ if (!folio_test_large(folio) && (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte))) ksm_page = true; folio_walk_end(&fw, vma); } if (!ksm_page) return 0; ret = handle_mm_fault(vma, addr, FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, NULL); } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* * We must loop until we no longer find a KSM page because * handle_mm_fault() may back out if there's any difficulty e.g. if * pte accessed bit gets updated concurrently. * * VM_FAULT_SIGBUS could occur if we race with truncation of the * backing file, which also invalidates anonymous pages: that's * okay, that truncation will have unmapped the KSM page for us. * * VM_FAULT_OOM: at the time of writing (late July 2009), setting * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the * current task has TIF_MEMDIE set, and will be OOM killed on return * to user; and ksmd, having no mm, would never be chosen for that. * * But if the mm is in a limited mem_cgroup, then the fault may fail * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and * even ksmd can fail in this way - though it's usually breaking ksm * just to undo a merge it made a moment before, so unlikely to oom. * * That's a pity: we might therefore have more kernel pages allocated * than we're counting as nodes in the stable tree; but ksm_do_scan * will retry to break_cow on each pass, so should recover the page * in due course. The important thing is to not let VM_MERGEABLE * be cleared while any such pages might remain in the area. */ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags) { if (vm_flags & (VM_SHARED | VM_MAYSHARE | VM_SPECIAL | VM_HUGETLB | VM_DROPPABLE)) return false; /* just ignore the advice */ if (file_is_dax(file)) return false; #ifdef VM_SAO if (vm_flags & VM_SAO) return false; #endif #ifdef VM_SPARC_ADI if (vm_flags & VM_SPARC_ADI) return false; #endif return true; } static bool vma_ksm_compatible(struct vm_area_struct *vma) { return ksm_compatible(vma->vm_file, vma->vm_flags); } static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; if (ksm_test_exit(mm)) return NULL; vma = vma_lookup(mm, addr); if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) return NULL; return vma; } static void break_cow(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; /* * It is not an accident that whenever we want to break COW * to undo, we also need to drop a reference to the anon_vma. */ put_anon_vma(rmap_item->anon_vma); mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) break_ksm(vma, addr, false); mmap_read_unlock(mm); } static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; struct page *page = NULL; struct folio_walk fw; struct folio *folio; mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (!vma) goto out; folio = folio_walk_start(&fw, vma, addr, 0); if (folio) { if (!folio_is_zone_device(folio) && folio_test_anon(folio)) { folio_get(folio); page = fw.page; } folio_walk_end(&fw, vma); } out: if (page) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } mmap_read_unlock(mm); return page; } /* * This helper is used for getting right index into array of tree roots. * When merge_across_nodes knob is set to 1, there are only two rb-trees for * stable and unstable pages from all nodes with roots in index 0. Otherwise, * every node has its own stable and unstable tree. */ static inline int get_kpfn_nid(unsigned long kpfn) { return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup, struct rb_root *root) { struct ksm_stable_node *chain = alloc_stable_node(); VM_BUG_ON(is_stable_node_chain(dup)); if (likely(chain)) { INIT_HLIST_HEAD(&chain->hlist); chain->chain_prune_time = jiffies; chain->rmap_hlist_len = STABLE_NODE_CHAIN; #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) chain->nid = NUMA_NO_NODE; /* debug */ #endif ksm_stable_node_chains++; /* * Put the stable node chain in the first dimension of * the stable tree and at the same time remove the old * stable node. */ rb_replace_node(&dup->node, &chain->node, root); /* * Move the old stable node to the second dimension * queued in the hlist_dup. The invariant is that all * dup stable_nodes in the chain->hlist point to pages * that are write protected and have the exact same * content. */ stable_node_chain_add_dup(dup, chain); } return chain; } static inline void free_stable_node_chain(struct ksm_stable_node *chain, struct rb_root *root) { rb_erase(&chain->node, root); free_stable_node(chain); ksm_stable_node_chains--; } static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node) { struct ksm_rmap_item *rmap_item; /* check it's not STABLE_NODE_CHAIN or negative */ BUG_ON(stable_node->rmap_hlist_len < 0); hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) { ksm_pages_sharing--; trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm); } else { ksm_pages_shared--; } rmap_item->mm->ksm_merging_pages--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } /* * We need the second aligned pointer of the migrate_nodes * list_head to stay clear from the rb_parent_color union * (aligned and different than any node) and also different * from &migrate_nodes. This will verify that future list.h changes * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it. */ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); trace_ksm_remove_ksm_page(stable_node->kpfn); if (stable_node->head == &migrate_nodes) list_del(&stable_node->list); else stable_node_dup_del(stable_node); free_stable_node(stable_node); } enum ksm_get_folio_flags { KSM_GET_FOLIO_NOLOCK, KSM_GET_FOLIO_LOCK, KSM_GET_FOLIO_TRYLOCK }; /* * ksm_get_folio: checks if the page indicated by the stable node * is still its ksm page, despite having held no reference to it. * In which case we can trust the content of the page, and it * returns the gotten page; but if the page has now been zapped, * remove the stale node from the stable tree and return NULL. * But beware, the stable node's page might be being migrated. * * You would expect the stable_node to hold a reference to the ksm page. * But if it increments the page's count, swapping out has to wait for * ksmd to come around again before it can free the page, which may take * seconds or even minutes: much too unresponsive. So instead we use a * "keyhole reference": access to the ksm page from the stable node peeps * out through its keyhole to see if that page still holds the right key, * pointing back to this stable node. This relies on freeing a PageAnon * page to reset its page->mapping to NULL, and relies on no other use of * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node, enum ksm_get_folio_flags flags) { struct folio *folio; void *expected_mapping; unsigned long kpfn; expected_mapping = (void *)((unsigned long)stable_node | FOLIO_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ folio = pfn_folio(kpfn); if (READ_ONCE(folio->mapping) != expected_mapping) goto stale; /* * We cannot do anything with the page while its refcount is 0. * Usually 0 means free, or tail of a higher-order page: in which * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * the same is in reuse_ksm_page() case; but if page is swapcache * in folio_migrate_mapping(), it might still be our page, * in which case it's essential to keep the node. */ while (!folio_try_get(folio)) { /* * Another check for folio->mapping != expected_mapping * would work here too. We have chosen to test the * swapcache flag to optimize the common case, when the * folio is or is about to be freed: the swapcache flag * is cleared (under spin_lock_irq) in the ref_freeze * section of __remove_mapping(); but anon folio->mapping * is reset to NULL later, in free_pages_prepare(). */ if (!folio_test_swapcache(folio)) goto stale; cpu_relax(); } if (READ_ONCE(folio->mapping) != expected_mapping) { folio_put(folio); goto stale; } if (flags == KSM_GET_FOLIO_TRYLOCK) { if (!folio_trylock(folio)) { folio_put(folio); return ERR_PTR(-EBUSY); } } else if (flags == KSM_GET_FOLIO_LOCK) folio_lock(folio); if (flags != KSM_GET_FOLIO_NOLOCK) { if (READ_ONCE(folio->mapping) != expected_mapping) { folio_unlock(folio); folio_put(folio); goto stale; } } return folio; stale: /* * We come here from above when folio->mapping or the swapcache flag * suggests that the node is stale; but it might be under migration. * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(), * before checking whether node->kpfn has been changed. */ smp_rmb(); if (READ_ONCE(stable_node->kpfn) != kpfn) goto again; remove_node_from_stable_tree(stable_node); return NULL; } /* * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. */ static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item) { if (rmap_item->address & STABLE_FLAG) { struct ksm_stable_node *stable_node; struct folio *folio; stable_node = rmap_item->head; folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); if (!folio) goto out; hlist_del(&rmap_item->hlist); folio_unlock(folio); folio_put(folio); if (!hlist_empty(&stable_node->hlist)) ksm_pages_sharing--; else ksm_pages_shared--; rmap_item->mm->ksm_merging_pages--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->head = NULL; rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { unsigned char age; /* * Usually ksmd can and must skip the rb_erase, because * root_unstable_tree was already reset to RB_ROOT. * But be careful when an mm is exiting: do the rb_erase * if this rmap_item was inserted by this scan, rather * than left over from before. */ age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); BUG_ON(age > 1); if (!age) rb_erase(&rmap_item->node, root_unstable_tree + NUMA(rmap_item->nid)); ksm_pages_unshared--; rmap_item->address &= PAGE_MASK; } out: cond_resched(); /* we're called from many long loops */ } static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) { while (*rmap_list) { struct ksm_rmap_item *rmap_item = *rmap_list; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } } /* * Though it's very tempting to unmerge rmap_items from stable tree rather * than check every pte of a given vma, the locking doesn't quite work for * that - an rmap_item is assigned to the stable tree after inserting ksm * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing * rmap_items from parent to child at fork time (so as not to waste time * if exit comes before the next scan reaches it). * * Similarly, although we'd like to remove rmap_items (so updating counts * and freeing memory) when unmerging an area, it's easier to leave that * to the next pass of ksmd - consider, for example, how ksmd might be * in cmp_and_merge_page on one of the rmap_items we would be removing. */ static int unmerge_ksm_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool lock_vma) { unsigned long addr; int err = 0; for (addr = start; addr < end && !err; addr += PAGE_SIZE) { if (ksm_test_exit(vma->vm_mm)) break; if (signal_pending(current)) err = -ERESTARTSYS; else err = break_ksm(vma, addr, lock_vma); } return err; } static inline struct ksm_stable_node *folio_stable_node(const struct folio *folio) { return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; } static inline void folio_set_stable_node(struct folio *folio, struct ksm_stable_node *stable_node) { VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio); folio->mapping = (void *)((unsigned long)stable_node | FOLIO_MAPPING_KSM); } #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: */ static int remove_stable_node(struct ksm_stable_node *stable_node) { struct folio *folio; int err; folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); if (!folio) { /* * ksm_get_folio did remove_node_from_stable_tree itself. */ return 0; } /* * Page could be still mapped if this races with __mmput() running in * between ksm_exit() and exit_mmap(). Just refuse to let * merge_across_nodes/max_page_sharing be switched. */ err = -EBUSY; if (!folio_mapped(folio)) { /* * The stable node did not yet appear stale to ksm_get_folio(), * since that allows for an unmapped ksm folio to be recognized * right up until it is freed; but the node is safe to remove. * This folio might be in an LRU cache waiting to be freed, * or it might be in the swapcache (perhaps under writeback), * or it might have been removed from swapcache a moment ago. */ folio_set_stable_node(folio, NULL); remove_node_from_stable_tree(stable_node); err = 0; } folio_unlock(folio); folio_put(folio); return err; } static int remove_stable_node_chain(struct ksm_stable_node *stable_node, struct rb_root *root) { struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { VM_BUG_ON(is_stable_node_dup(stable_node)); if (remove_stable_node(stable_node)) return true; else return false; } hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { VM_BUG_ON(!is_stable_node_dup(dup)); if (remove_stable_node(dup)) return true; } BUG_ON(!hlist_empty(&stable_node->hlist)); free_stable_node_chain(stable_node, root); return false; } static int remove_all_stable_nodes(void) { struct ksm_stable_node *stable_node, *next; int nid; int err = 0; for (nid = 0; nid < ksm_nr_node_ids; nid++) { while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, struct ksm_stable_node, node); if (remove_stable_node_chain(stable_node, root_stable_tree + nid)) { err = -EBUSY; break; /* proceed to next nid */ } cond_resched(); } } list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (remove_stable_node(stable_node)) err = -EBUSY; cond_resched(); } return err; } static int unmerge_and_remove_all_rmap_items(void) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; spin_lock(&ksm_mmlist_lock); slot = list_entry(ksm_mm_head.slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { VMA_ITERATOR(vmi, mm_slot->slot.mm, 0); mm = mm_slot->slot.mm; mmap_read_lock(mm); /* * Exit right away if mm is exiting to avoid lockdep issue in * the maple tree */ if (ksm_test_exit(mm)) goto mm_exiting; for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) continue; err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, false); if (err) goto error; } mm_exiting: remove_trailing_rmap_items(&mm_slot->rmap_list); mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_test_exit(mm)) { hash_del(&mm_slot->slot.hash); list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); mm_flags_clear(MMF_VM_MERGEABLE, mm); mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); } /* Clean up stable nodes, but don't worry if some are still busy */ remove_all_stable_nodes(); ksm_scan.seqnr = 0; return 0; error: mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = &ksm_mm_head; spin_unlock(&ksm_mmlist_lock); return err; } #endif /* CONFIG_SYSFS */ static u32 calc_checksum(struct page *page) { u32 checksum; void *addr = kmap_local_page(page); checksum = xxhash(addr, PAGE_SIZE, 0); kunmap_local(addr); return checksum; } static int write_protect_page(struct vm_area_struct *vma, struct folio *folio, pte_t *orig_pte) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, 0, 0); int swapped; int err = -EFAULT; struct mmu_notifier_range range; bool anon_exclusive; pte_t entry; if (WARN_ON_ONCE(folio_test_large(folio))) return err; pvmw.address = page_address_in_vma(folio, folio_page(folio, 0), vma); if (pvmw.address == -EFAULT) goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); if (!page_vma_mapped_walk(&pvmw)) goto out_mn; if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) goto out_unlock; entry = ptep_get(pvmw.pte); /* * Handle PFN swap PTEs, such as device-exclusive ones, that actually * map pages: give up just like the next folio_walk would. */ if (unlikely(!pte_present(entry))) goto out_unlock; anon_exclusive = PageAnonExclusive(&folio->page); if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { swapped = folio_test_swapcache(folio); flush_cache_page(vma, pvmw.address, folio_pfn(folio)); /* * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make * with the pagecount against the mapcount is racy and * O_DIRECT can happen right after the check. * So we clear the pte and flush the tlb before the check * this assure us that no O_DIRECT can happen after the check * or in the middle of the check. * * No need to notify as we are downgrading page table to read * only not changing it to point to a new page. * * See Documentation/mm/mmu_notifier.rst */ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page */ if (folio_mapcount(folio) + 1 + swapped != folio_ref_count(folio)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } /* See folio_try_share_anon_rmap_pte(): clear PTE first. */ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, &folio->page)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } if (pte_dirty(entry)) folio_mark_dirty(folio); entry = pte_mkclean(entry); if (pte_write(entry)) entry = pte_wrprotect(entry); set_pte_at(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = entry; err = 0; out_unlock: page_vma_mapped_walk_done(&pvmw); out_mn: mmu_notifier_invalidate_range_end(&range); out: return err; } /** * replace_page - replace page in vma by new ksm page * @vma: vma that holds the pte pointing to page * @page: the page we are replacing by kpage * @kpage: the ksm page we replace page by * @orig_pte: the original value of the pte * * Returns 0 on success, -EFAULT on failure. */ static int replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage, pte_t orig_pte) { struct folio *kfolio = page_folio(kpage); struct mm_struct *mm = vma->vm_mm; struct folio *folio = page_folio(page); pmd_t *pmd; pmd_t pmde; pte_t *ptep; pte_t newpte; spinlock_t *ptl; unsigned long addr; int err = -EFAULT; struct mmu_notifier_range range; addr = page_address_in_vma(folio, page, vma); if (addr == -EFAULT) goto out; pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; /* * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() * without holding anon_vma lock for write. So when looking for a * genuine pmde (in which to find pte), test present and !THP together. */ pmde = pmdp_get_lockless(pmd); if (!pmd_present(pmde) || pmd_trans_huge(pmde)) goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!ptep) goto out_mn; if (!pte_same(ptep_get(ptep), orig_pte)) { pte_unmap_unlock(ptep, ptl); goto out_mn; } VM_BUG_ON_PAGE(PageAnonExclusive(page), page); VM_BUG_ON_FOLIO(folio_test_anon(kfolio) && PageAnonExclusive(kpage), kfolio); /* * No need to check ksm_use_zero_pages here: we can only have a * zero_page here if ksm_use_zero_pages was enabled already. */ if (!is_zero_pfn(page_to_pfn(kpage))) { folio_get(kfolio); folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE); newpte = mk_pte(kpage, vma->vm_page_prot); } else { /* * Use pte_mkdirty to mark the zero page mapped by KSM, and then * we can easily track all KSM-placed zero pages by checking if * the dirty bit in zero page's PTE is set. */ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); ksm_map_zero_page(mm); /* * We're replacing an anonymous page with a zero page, which is * not anonymous. We need to do proper accounting otherwise we * will get wrong values in /proc, and a BUG message in dmesg * when tearing down the mm. */ dec_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep))); /* * No need to notify as we are replacing a read only page with another * read only page with the same content. * * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); set_pte_at(mm, addr, ptep, newpte); folio_remove_rmap_pte(folio, page, vma); if (!folio_mapped(folio)) folio_free_swap(folio); folio_put(folio); pte_unmap_unlock(ptep, ptl); err = 0; out_mn: mmu_notifier_invalidate_range_end(&range); out: return err; } /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page * @page: the PageAnon page that we want to replace with kpage * @kpage: the KSM page that we want to map instead of page, * or NULL the first time when we want to use page as kpage. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_one_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) { struct folio *folio = page_folio(page); pte_t orig_pte = __pte(0); int err = -EFAULT; if (page == kpage) /* ksm page forked */ return 0; if (!folio_test_anon(folio)) goto out; /* * We need the folio lock to read a stable swapcache flag in * write_protect_page(). We trylock because we don't want to wait * here - we prefer to continue scanning and merging different * pages, then come back to this page when it is unlocked. */ if (!folio_trylock(folio)) goto out; if (folio_test_large(folio)) { if (split_huge_page(page)) goto out_unlock; folio = page_folio(page); } /* * If this anonymous page is mapped only here, its pte may need * to be write-protected. If it's mapped elsewhere, all of its * ptes are necessarily already write-protected. But in either * case, we need to lock and check page_count is not raised. */ if (write_protect_page(vma, folio, &orig_pte) == 0) { if (!kpage) { /* * While we hold folio lock, upgrade folio from * anon to a NULL stable_node with the KSM flag set: * stable_tree_insert() will update stable_node. */ folio_set_stable_node(folio, NULL); folio_mark_accessed(folio); /* * Page reclaim just frees a clean folio with no dirty * ptes: make sure that the ksm page would be swapped. */ if (!folio_test_dirty(folio)) folio_mark_dirty(folio); err = 0; } else if (pages_identical(page, kpage)) err = replace_page(vma, page, kpage, orig_pte); } out_unlock: folio_unlock(folio); out: return err; } /* * This function returns 0 if the pages were merged or if they are * no longer merging candidates (e.g., VMA stale), -EFAULT otherwise. */ static int try_to_merge_with_zero_page(struct ksm_rmap_item *rmap_item, struct page *page) { struct mm_struct *mm = rmap_item->mm; int err = -EFAULT; /* * Same checksum as an empty page. We attempt to merge it with the * appropriate zero page if the user enabled this via sysfs. */ if (ksm_use_zero_pages && (rmap_item->oldchecksum == zero_checksum)) { struct vm_area_struct *vma; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (vma) { err = try_to_merge_one_page(vma, page, ZERO_PAGE(rmap_item->address)); trace_ksm_merge_one_page( page_to_pfn(ZERO_PAGE(rmap_item->address)), rmap_item, mm, err); } else { /* * If the vma is out of date, we do not need to * continue. */ err = 0; } mmap_read_unlock(mm); } return err; } /* * try_to_merge_with_ksm_page - like try_to_merge_two_pages, * but no new kernel page is allocated: kpage must already be a ksm page. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, struct page *page, struct page *kpage) { struct mm_struct *mm = rmap_item->mm; struct vm_area_struct *vma; int err = -EFAULT; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (!vma) goto out; err = try_to_merge_one_page(vma, page, kpage); if (err) goto out; /* Unstable nid is in union with stable anon_vma: remove first */ remove_rmap_item_from_tree(rmap_item); /* Must get reference to anon_vma while still holding mmap_lock */ rmap_item->anon_vma = vma->anon_vma; get_anon_vma(vma->anon_vma); out: mmap_read_unlock(mm); trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page), rmap_item, mm, err); return err; } /* * try_to_merge_two_pages - take two identical pages and prepare them * to be merged into one page. * * This function returns the kpage if we successfully merged two identical * pages into one ksm page, NULL otherwise. * * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ static struct folio *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, struct page *page, struct ksm_rmap_item *tree_rmap_item, struct page *tree_page) { int err; err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { err = try_to_merge_with_ksm_page(tree_rmap_item, tree_page, page); /* * If that fails, we have a ksm page with only one pte * pointing to it: so break it. */ if (err) break_cow(rmap_item); } return err ? NULL : page_folio(page); } static __always_inline bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset) { VM_BUG_ON(stable_node->rmap_hlist_len < 0); /* * Check that at least one mapping still exists, otherwise * there's no much point to merge and share with this * stable_node, as the underlying tree_page of the other * sharer is going to be freed soon. */ return stable_node->rmap_hlist_len && stable_node->rmap_hlist_len + offset < ksm_max_page_sharing; } static __always_inline bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) { return __is_page_sharing_candidate(stable_node, 0); } static struct folio *stable_node_dup(struct ksm_stable_node **_stable_node_dup, struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; struct folio *folio, *tree_folio = NULL; int found_rmap_hlist_len; if (!prune_stale_stable_nodes || time_before(jiffies, stable_node->chain_prune_time + msecs_to_jiffies( ksm_stable_node_chains_prune_millisecs))) prune_stale_stable_nodes = false; else stable_node->chain_prune_time = jiffies; hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { cond_resched(); /* * We must walk all stable_node_dup to prune the stale * stable nodes during lookup. * * ksm_get_folio can drop the nodes from the * stable_node->hlist if they point to freed pages * (that's why we do a _safe walk). The "dup" * stable_node parameter itself will be freed from * under us if it returns NULL. */ folio = ksm_get_folio(dup, KSM_GET_FOLIO_NOLOCK); if (!folio) continue; /* Pick the best candidate if possible. */ if (!found || (is_page_sharing_candidate(dup) && (!is_page_sharing_candidate(found) || dup->rmap_hlist_len > found_rmap_hlist_len))) { if (found) folio_put(tree_folio); found = dup; found_rmap_hlist_len = found->rmap_hlist_len; tree_folio = folio; /* skip put_page for found candidate */ if (!prune_stale_stable_nodes && is_page_sharing_candidate(found)) break; continue; } folio_put(folio); } if (found) { if (hlist_is_singular_node(&found->hlist_dup, &stable_node->hlist)) { /* * If there's not just one entry it would * corrupt memory, better BUG_ON. In KSM * context with no lock held it's not even * fatal. */ BUG_ON(stable_node->hlist.first->next); /* * There's just one entry and it is below the * deduplication limit so drop the chain. */ rb_replace_node(&stable_node->node, &found->node, root); free_stable_node(stable_node); ksm_stable_node_chains--; ksm_stable_node_dups--; /* * NOTE: the caller depends on the stable_node * to be equal to stable_node_dup if the chain * was collapsed. */ *_stable_node = found; /* * Just for robustness, as stable_node is * otherwise left as a stable pointer, the * compiler shall optimize it away at build * time. */ stable_node = NULL; } else if (stable_node->hlist.first != &found->hlist_dup && __is_page_sharing_candidate(found, 1)) { /* * If the found stable_node dup can accept one * more future merge (in addition to the one * that is underway) and is not at the head of * the chain, put it there so next search will * be quicker in the !prune_stale_stable_nodes * case. * * NOTE: it would be inaccurate to use nr > 1 * instead of checking the hlist.first pointer * directly, because in the * prune_stale_stable_nodes case "nr" isn't * the position of the found dup in the chain, * but the total number of dups in the chain. */ hlist_del(&found->hlist_dup); hlist_add_head(&found->hlist_dup, &stable_node->hlist); } } else { /* Its hlist must be empty if no one found. */ free_stable_node_chain(stable_node, root); } *_stable_node_dup = found; return tree_folio; } /* * Like for ksm_get_folio, this function can free the *_stable_node and * *_stable_node_dup if the returned tree_page is NULL. * * It can also free and overwrite *_stable_node with the found * stable_node_dup if the chain is collapsed (in which case * *_stable_node will be equal to *_stable_node_dup like if the chain * never existed). It's up to the caller to verify tree_page is not * NULL before dereferencing *_stable_node or *_stable_node_dup. * * *_stable_node_dup is really a second output parameter of this * function and will be overwritten in all cases, the caller doesn't * need to initialize it. */ static struct folio *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { struct ksm_stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { *_stable_node_dup = stable_node; return ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); } return stable_node_dup(_stable_node_dup, _stable_node, root, prune_stale_stable_nodes); } static __always_inline struct folio *chain_prune(struct ksm_stable_node **s_n_d, struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, true); } static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d, struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, false); } /* * stable_tree_search - search for page inside the stable tree * * This function checks if there is a page inside the stable tree * with identical content to the page that we are scanning right now. * * This function returns the stable tree node of identical content if found, * -EBUSY if the stable node's page is being migrated, NULL otherwise. */ static struct folio *stable_tree_search(struct page *page) { int nid; struct rb_root *root; struct rb_node **new; struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup; struct ksm_stable_node *page_node; struct folio *folio; folio = page_folio(page); page_node = folio_stable_node(folio); if (page_node && page_node->head != &migrate_nodes) { /* ksm page forked */ folio_get(folio); return folio; } nid = get_kpfn_nid(folio_pfn(folio)); root = root_stable_tree + nid; again: new = &root->rb_node; parent = NULL; while (*new) { struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); tree_folio = chain_prune(&stable_node_dup, &stable_node, root); if (!tree_folio) { /* * If we walked over a stale stable_node, * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate * false negative insertions just because some * stable_node was stale. */ goto again; } ret = memcmp_pages(page, &tree_folio->page); folio_put(tree_folio); parent = *new; if (ret < 0) new = &parent->rb_left; else if (ret > 0) new = &parent->rb_right; else { if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); /* * If the mapcount of our migrated KSM folio is * at most 1, we can merge it with another * KSM folio where we know that we have space * for one more mapping without exceeding the * ksm_max_page_sharing limit: see * chain_prune(). This way, we can avoid adding * this stable node to the chain. */ if (folio_mapcount(folio) > 1) goto chain_append; } if (!is_page_sharing_candidate(stable_node_dup)) { /* * If the stable_node is a chain and * we got a payload match in memcmp * but we cannot merge the scanned * page in any of the existing * stable_node dups because they're * all full, we need to wait the * scanned page to find itself a match * in the unstable tree to create a * brand new KSM page to add later to * the dups of this stable_node. */ return NULL; } /* * Lock and unlock the stable_node's page (which * might already have been migrated) so that page * migration is sure to notice its raised count. * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ tree_folio = ksm_get_folio(stable_node_dup, KSM_GET_FOLIO_TRYLOCK); if (PTR_ERR(tree_folio) == -EBUSY) return ERR_PTR(-EBUSY); if (unlikely(!tree_folio)) /* * The tree may have been rebalanced, * so re-evaluate parent and new. */ goto again; folio_unlock(tree_folio); if (get_kpfn_nid(stable_node_dup->kpfn) != NUMA(stable_node_dup->nid)) { folio_put(tree_folio); goto replace; } return tree_folio; } } if (!page_node) return NULL; list_del(&page_node->list); DO_NUMA(page_node->nid = nid); rb_link_node(&page_node->node, parent, new); rb_insert_color(&page_node->node, root); out: if (is_page_sharing_candidate(page_node)) { folio_get(folio); return folio; } else return NULL; replace: /* * If stable_node was a chain and chain_prune collapsed it, * stable_node has been updated to be the new regular * stable_node. A collapse of the chain is indistinguishable * from the case there was no chain in the stable * rbtree. Otherwise stable_node is the chain and * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* there is no chain */ if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); rb_replace_node(&stable_node_dup->node, &page_node->node, root); if (is_page_sharing_candidate(page_node)) folio_get(folio); else folio = NULL; } else { rb_erase(&stable_node_dup->node, root); folio = NULL; } } else { VM_BUG_ON(!is_stable_node_chain(stable_node)); __stable_node_dup_del(stable_node_dup); if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); if (is_page_sharing_candidate(page_node)) folio_get(folio); else folio = NULL; } else { folio = NULL; } } stable_node_dup->head = &migrate_nodes; list_add(&stable_node_dup->list, stable_node_dup->head); return folio; chain_append: /* * If stable_node was a chain and chain_prune collapsed it, * stable_node has been updated to be the new regular * stable_node. A collapse of the chain is indistinguishable * from the case there was no chain in the stable * rbtree. Otherwise stable_node is the chain and * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ stable_node = alloc_stable_node_chain(stable_node_dup, root); if (!stable_node) return NULL; } /* * Add this stable_node dup that was * migrated to the stable_node chain * of the current nid for this page * content. */ VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); goto out; } /* * stable_tree_insert - insert stable tree node pointing to new ksm page * into the stable tree. * * This function returns the stable tree node just allocated on success, * NULL otherwise. */ static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio) { int nid; unsigned long kpfn; struct rb_root *root; struct rb_node **new; struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup; bool need_chain = false; kpfn = folio_pfn(kfolio); nid = get_kpfn_nid(kpfn); root = root_stable_tree + nid; again: parent = NULL; new = &root->rb_node; while (*new) { struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); tree_folio = chain(&stable_node_dup, &stable_node, root); if (!tree_folio) { /* * If we walked over a stale stable_node, * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate * false negative insertions just because some * stable_node was stale. */ goto again; } ret = memcmp_pages(&kfolio->page, &tree_folio->page); folio_put(tree_folio); parent = *new; if (ret < 0) new = &parent->rb_left; else if (ret > 0) new = &parent->rb_right; else { need_chain = true; break; } } stable_node_dup = alloc_stable_node(); if (!stable_node_dup) return NULL; INIT_HLIST_HEAD(&stable_node_dup->hlist); stable_node_dup->kpfn = kpfn; stable_node_dup->rmap_hlist_len = 0; DO_NUMA(stable_node_dup->nid = nid); if (!need_chain) { rb_link_node(&stable_node_dup->node, parent, new); rb_insert_color(&stable_node_dup->node, root); } else { if (!is_stable_node_chain(stable_node)) { struct ksm_stable_node *orig = stable_node; /* chain is missing so create it */ stable_node = alloc_stable_node_chain(orig, root); if (!stable_node) { free_stable_node(stable_node_dup); return NULL; } } stable_node_chain_add_dup(stable_node_dup, stable_node); } folio_set_stable_node(kfolio, stable_node_dup); return stable_node_dup; } /* * unstable_tree_search_insert - search for identical page, * else insert rmap_item into the unstable tree. * * This function searches for a page in the unstable tree identical to the * page currently being scanned; and if no identical page is found in the * tree, we insert rmap_item as a new object into the unstable tree. * * This function returns pointer to rmap_item found to be identical * to the currently scanned page, NULL otherwise. * * This function does both searching and inserting, because they share * the same walking algorithm in an rbtree. */ static struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item, struct page *page, struct page **tree_pagep) { struct rb_node **new; struct rb_root *root; struct rb_node *parent = NULL; int nid; nid = get_kpfn_nid(page_to_pfn(page)); root = root_unstable_tree + nid; new = &root->rb_node; while (*new) { struct ksm_rmap_item *tree_rmap_item; struct page *tree_page; int ret; cond_resched(); tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); if (!tree_page) return NULL; /* * Don't substitute a ksm page for a forked page. */ if (page == tree_page) { put_page(tree_page); return NULL; } ret = memcmp_pages(page, tree_page); parent = *new; if (ret < 0) { put_page(tree_page); new = &parent->rb_left; } else if (ret > 0) { put_page(tree_page); new = &parent->rb_right; } else if (!ksm_merge_across_nodes && page_to_nid(tree_page) != nid) { /* * If tree_page has been migrated to another NUMA node, * it will be flushed out and put in the right unstable * tree next time: only merge with it when across_nodes. */ put_page(tree_page); return NULL; } else { *tree_pagep = tree_page; return tree_rmap_item; } } rmap_item->address |= UNSTABLE_FLAG; rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); DO_NUMA(rmap_item->nid = nid); rb_link_node(&rmap_item->node, parent, new); rb_insert_color(&rmap_item->node, root); ksm_pages_unshared++; return NULL; } /* * stable_tree_append - add another rmap_item to the linked list of * rmap_items hanging off a given node of the stable tree, all sharing * the same ksm page. */ static void stable_tree_append(struct ksm_rmap_item *rmap_item, struct ksm_stable_node *stable_node, bool max_page_sharing_bypass) { /* * rmap won't find this mapping if we don't insert the * rmap_item in the right stable_node * duplicate. page_migration could break later if rmap breaks, * so we can as well crash here. We really need to check for * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check * for other negative values as an underflow if detected here * for the first time (and not when decreasing rmap_hlist_len) * would be sign of memory corruption in the stable_node. */ BUG_ON(stable_node->rmap_hlist_len < 0); stable_node->rmap_hlist_len++; if (!max_page_sharing_bypass) /* possibly non fatal but unexpected overflow, only warn */ WARN_ON_ONCE(stable_node->rmap_hlist_len > ksm_max_page_sharing); rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; hlist_add_head(&rmap_item->hlist, &stable_node->hlist); if (rmap_item->hlist.next) ksm_pages_sharing++; else ksm_pages_shared++; rmap_item->mm->ksm_merging_pages++; } /* * cmp_and_merge_page - first see if page can be merged into the stable tree; * if not, compare checksum to previous and if it's the same, see if page can * be inserted into the unstable tree, or merged with a page already there and * both transferred to the stable tree. * * @page: the page that we are searching identical page to. * @rmap_item: the reverse mapping into the virtual address of this page */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { struct folio *folio = page_folio(page); struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; struct folio *kfolio; unsigned int checksum; int err; bool max_page_sharing_bypass = false; stable_node = folio_stable_node(folio); if (stable_node) { if (stable_node->head != &migrate_nodes && get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != NUMA(stable_node->nid)) { stable_node_dup_del(stable_node); stable_node->head = &migrate_nodes; list_add(&stable_node->list, stable_node->head); } if (stable_node->head != &migrate_nodes && rmap_item->head == stable_node) return; /* * If it's a KSM fork, allow it to go over the sharing limit * without warnings. */ if (!is_page_sharing_candidate(stable_node)) max_page_sharing_bypass = true; } else { remove_rmap_item_from_tree(rmap_item); /* * If the hash value of the page has changed from the last time * we calculated it, this page is changing frequently: therefore we * don't want to insert it in the unstable tree, and we don't want * to waste our time searching for something identical to it there. */ checksum = calc_checksum(page); if (rmap_item->oldchecksum != checksum) { rmap_item->oldchecksum = checksum; return; } if (!try_to_merge_with_zero_page(rmap_item, page)) return; } /* Start by searching for the folio in the stable tree */ kfolio = stable_tree_search(page); if (kfolio == folio && rmap_item->head == stable_node) { folio_put(kfolio); return; } remove_rmap_item_from_tree(rmap_item); if (kfolio) { if (kfolio == ERR_PTR(-EBUSY)) return; err = try_to_merge_with_ksm_page(rmap_item, page, &kfolio->page); if (!err) { /* * The page was successfully merged: * add its rmap_item to the stable tree. */ folio_lock(kfolio); stable_tree_append(rmap_item, folio_stable_node(kfolio), max_page_sharing_bypass); folio_unlock(kfolio); } folio_put(kfolio); return; } tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { bool split; kfolio = try_to_merge_two_pages(rmap_item, page, tree_rmap_item, tree_page); /* * If both pages we tried to merge belong to the same compound * page, then we actually ended up increasing the reference * count of the same compound page twice, and split_huge_page * failed. * Here we set a flag if that happened, and we use it later to * try split_huge_page again. Since we call put_page right * afterwards, the reference count will be correct and * split_huge_page should succeed. */ split = PageTransCompound(page) && compound_head(page) == compound_head(tree_page); put_page(tree_page); if (kfolio) { /* * The pages were successfully merged: insert new * node in the stable tree and add both rmap_items. */ folio_lock(kfolio); stable_node = stable_tree_insert(kfolio); if (stable_node) { stable_tree_append(tree_rmap_item, stable_node, false); stable_tree_append(rmap_item, stable_node, false); } folio_unlock(kfolio); /* * If we fail to insert the page into the stable tree, * we will have 2 virtual addresses that are pointing * to a ksm page left outside the stable tree, * in which case we need to break_cow on both. */ if (!stable_node) { break_cow(tree_rmap_item); break_cow(rmap_item); } } else if (split) { /* * We are here if we tried to merge two pages and * failed because they both belonged to the same * compound page. We will split the page now, but no * merging will take place. * We do not want to add the cost of a full lock; if * the page is locked, it is better to skip it and * perhaps try again later. */ if (!folio_trylock(folio)) return; split_huge_page(page); folio = page_folio(page); folio_unlock(folio); } } } static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, struct ksm_rmap_item **rmap_list, unsigned long addr) { struct ksm_rmap_item *rmap_item; while (*rmap_list) { rmap_item = *rmap_list; if ((rmap_item->address & PAGE_MASK) == addr) return rmap_item; if (rmap_item->address > addr) break; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ rmap_item->mm = mm_slot->slot.mm; rmap_item->mm->ksm_rmap_items++; rmap_item->address = addr; rmap_item->rmap_list = *rmap_list; *rmap_list = rmap_item; } return rmap_item; } /* * Calculate skip age for the ksm page age. The age determines how often * de-duplicating has already been tried unsuccessfully. If the age is * smaller, the scanning of this page is skipped for less scans. * * @age: rmap_item age of page */ static unsigned int skip_age(rmap_age_t age) { if (age <= 3) return 1; if (age <= 5) return 2; if (age <= 8) return 4; return 8; } /* * Determines if a page should be skipped for the current scan. * * @folio: folio containing the page to check * @rmap_item: associated rmap_item of page */ static bool should_skip_rmap_item(struct folio *folio, struct ksm_rmap_item *rmap_item) { rmap_age_t age; if (!ksm_smart_scan) return false; /* * Never skip pages that are already KSM; pages cmp_and_merge_page() * will essentially ignore them, but we still have to process them * properly. */ if (folio_test_ksm(folio)) return false; age = rmap_item->age; if (age != U8_MAX) rmap_item->age++; /* * Smaller ages are not skipped, they need to get a chance to go * through the different phases of the KSM merging. */ if (age < 3) return false; /* * Are we still allowed to skip? If not, then don't skip it * and determine how much more often we are allowed to skip next. */ if (!rmap_item->remaining_skips) { rmap_item->remaining_skips = skip_age(age); return false; } /* Skip this page */ ksm_pages_skipped++; rmap_item->remaining_skips--; remove_rmap_item_from_tree(rmap_item); return true; } static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct vm_area_struct *vma; struct ksm_rmap_item *rmap_item; struct vma_iterator vmi; int nid; if (list_empty(&ksm_mm_head.slot.mm_node)) return NULL; mm_slot = ksm_scan.mm_slot; if (mm_slot == &ksm_mm_head) { advisor_start_scan(); trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items); /* * A number of pages can hang around indefinitely in per-cpu * LRU cache, raised page count preventing write_protect_page * from merging them. Though it doesn't really matter much, * it is puzzling to see some stuck in pages_volatile until * other activity jostles them out, and they also prevented * LTP's KSM test from succeeding deterministically; so drain * them here (here rather than on entry to ksm_do_scan(), * so we don't IPI too often when pages_to_scan is set low). */ lru_add_drain_all(); /* * Whereas stale stable_nodes on the stable_tree itself * get pruned in the regular course of stable_tree_search(), * those moved out to the migrate_nodes list can accumulate: * so prune them once before each full scan. */ if (!ksm_merge_across_nodes) { struct ksm_stable_node *stable_node, *next; struct folio *folio; list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); if (folio) folio_put(folio); cond_resched(); } } for (nid = 0; nid < ksm_nr_node_ids; nid++) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); ksm_scan.mm_slot = mm_slot; spin_unlock(&ksm_mmlist_lock); /* * Although we tested list_empty() above, a racing __ksm_exit * of the last mm on the list may have removed it since then. */ if (mm_slot == &ksm_mm_head) return NULL; next_mm: ksm_scan.address = 0; ksm_scan.rmap_list = &mm_slot->rmap_list; } slot = &mm_slot->slot; mm = slot->mm; vma_iter_init(&vmi, mm, ksm_scan.address); mmap_read_lock(mm); if (ksm_test_exit(mm)) goto no_vmas; for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE)) continue; if (ksm_scan.address < vma->vm_start) ksm_scan.address = vma->vm_start; if (!vma->anon_vma) ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { struct page *tmp_page = NULL; struct folio_walk fw; struct folio *folio; if (ksm_test_exit(mm)) break; folio = folio_walk_start(&fw, vma, ksm_scan.address, 0); if (folio) { if (!folio_is_zone_device(folio) && folio_test_anon(folio)) { folio_get(folio); tmp_page = fw.page; } folio_walk_end(&fw, vma); } if (tmp_page) { flush_anon_page(vma, tmp_page, ksm_scan.address); flush_dcache_page(tmp_page); rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { ksm_scan.rmap_list = &rmap_item->rmap_list; if (should_skip_rmap_item(folio, rmap_item)) { folio_put(folio); goto next_page; } ksm_scan.address += PAGE_SIZE; *page = tmp_page; } else { folio_put(folio); } mmap_read_unlock(mm); return rmap_item; } next_page: ksm_scan.address += PAGE_SIZE; cond_resched(); } } if (ksm_test_exit(mm)) { no_vmas: ksm_scan.address = 0; ksm_scan.rmap_list = &mm_slot->rmap_list; } /* * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock * throughout, and found no VM_MERGEABLE: so do the same as * __ksm_exit does to remove this mm from all our lists now. * This applies either when cleaning up after __ksm_exit * (but beware: we can reach here even before __ksm_exit), * or when all VM_MERGEABLE areas have been unmapped (and * mmap_lock then protects against race with MADV_MERGEABLE). */ hash_del(&mm_slot->slot.hash); list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); mm_flags_clear(MMF_VM_MERGEABLE, mm); mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmap_read_unlock(mm); mmdrop(mm); } else { mmap_read_unlock(mm); /* * mmap_read_unlock(mm) first because after * spin_unlock(&ksm_mmlist_lock) run, the "mm" may * already have been freed under us by __ksm_exit() * because the "mm_slot" is still hashed and * ksm_scan.mm_slot doesn't point to it anymore. */ spin_unlock(&ksm_mmlist_lock); } /* Repeat until we've completed scanning the whole list */ mm_slot = ksm_scan.mm_slot; if (mm_slot != &ksm_mm_head) goto next_mm; advisor_stop_scan(); trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items); ksm_scan.seqnr++; return NULL; } /** * ksm_do_scan - the ksm scanner main worker function. * @scan_npages: number of pages we want to scan before we return. */ static void ksm_do_scan(unsigned int scan_npages) { struct ksm_rmap_item *rmap_item; struct page *page; while (scan_npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) return; cmp_and_merge_page(page, rmap_item); put_page(page); ksm_pages_scanned++; } } static int ksmd_should_run(void) { return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node); } static int ksm_scan_thread(void *nothing) { unsigned int sleep_ms; set_freezable(); set_user_nice(current, 5); while (!kthread_should_stop()) { mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksmd_should_run()) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); if (ksmd_should_run()) { sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); wait_event_freezable_timeout(ksm_iter_wait, sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), msecs_to_jiffies(sleep_ms)); } else { wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } } return 0; } static bool __ksm_should_add_vma(const struct file *file, vm_flags_t vm_flags) { if (vm_flags & VM_MERGEABLE) return false; return ksm_compatible(file, vm_flags); } static void __ksm_add_vma(struct vm_area_struct *vma) { if (__ksm_should_add_vma(vma->vm_file, vma->vm_flags)) vm_flags_set(vma, VM_MERGEABLE); } static int __ksm_del_vma(struct vm_area_struct *vma) { int err; if (!(vma->vm_flags & VM_MERGEABLE)) return 0; if (vma->anon_vma) { err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true); if (err) return err; } vm_flags_clear(vma, VM_MERGEABLE); return 0; } /** * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible * * @mm: Proposed VMA's mm_struct * @file: Proposed VMA's file-backed mapping, if any. * @vm_flags: Proposed VMA"s flags. * * Returns: @vm_flags possibly updated to mark mergeable. */ vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && __ksm_should_add_vma(file, vm_flags)) vm_flags |= VM_MERGEABLE; return vm_flags; } static void ksm_add_vmas(struct mm_struct *mm) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) __ksm_add_vma(vma); } static int ksm_del_vmas(struct mm_struct *mm) { struct vm_area_struct *vma; int err; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { err = __ksm_del_vma(vma); if (err) return err; } return 0; } /** * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all * compatible VMA's * * @mm: Pointer to mm * * Returns 0 on success, otherwise error code */ int ksm_enable_merge_any(struct mm_struct *mm) { int err; if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; } mm_flags_set(MMF_VM_MERGE_ANY, mm); ksm_add_vmas(mm); return 0; } /** * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm, * previously enabled via ksm_enable_merge_any(). * * Disabling merging implies unmerging any merged pages, like setting * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and * merging on all compatible VMA's remains enabled. * * @mm: Pointer to mm * * Returns 0 on success, otherwise error code */ int ksm_disable_merge_any(struct mm_struct *mm) { int err; if (!mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; err = ksm_del_vmas(mm); if (err) { ksm_add_vmas(mm); return err; } mm_flags_clear(MMF_VM_MERGE_ANY, mm); return 0; } int ksm_disable(struct mm_struct *mm) { mmap_assert_write_locked(mm); if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) return 0; if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return ksm_disable_merge_any(mm); return ksm_del_vmas(mm); } int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags) { struct mm_struct *mm = vma->vm_mm; int err; switch (advice) { case MADV_MERGEABLE: if (vma->vm_flags & VM_MERGEABLE) return 0; if (!vma_ksm_compatible(vma)) return 0; if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; } *vm_flags |= VM_MERGEABLE; break; case MADV_UNMERGEABLE: if (!(*vm_flags & VM_MERGEABLE)) return 0; /* just ignore the advice */ if (vma->anon_vma) { err = unmerge_ksm_pages(vma, start, end, true); if (err) return err; } *vm_flags &= ~VM_MERGEABLE; break; } return 0; } EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; int needs_wakeup; mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return -ENOMEM; slot = &mm_slot->slot; /* Check ksm_run too? Would need tighter locking */ needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node); spin_lock(&ksm_mmlist_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* * When KSM_RUN_MERGE (or KSM_RUN_STOP), * insert just behind the scanning cursor, to let the area settle * down a little; when fork is followed by immediate exec, we don't * want ksmd to waste time setting up and tearing down an rmap_list. * * But when KSM_RUN_UNMERGE, it's important to insert ahead of its * scanning cursor, otherwise KSM pages in newly forked mms will be * missed: then we might as well insert at the end of the list. */ if (ksm_run & KSM_RUN_UNMERGE) list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node); else list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_flags_set(MMF_VM_MERGEABLE, mm); mmgrab(mm); if (needs_wakeup) wake_up_interruptible(&ksm_thread_wait); trace_ksm_enter(mm); return 0; } void __ksm_exit(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot = NULL; struct mm_slot *slot; int easy_to_free = 0; /* * This process is exiting: if it's straightforward (as is the * case when ksmd was never running), free mm_slot immediately. * But if it's at the cursor or has rmap_items linked to it, use * mmap_lock to synchronize with any break_cows before pagetables * are freed, and leave the mm_slot on the list for ksmd to free. * Beware: ksm may already have noticed it exiting and freed the slot. */ spin_lock(&ksm_mmlist_lock); slot = mm_slot_lookup(mm_slots_hash, mm); if (!slot) goto unlock; mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_scan.mm_slot == mm_slot) goto unlock; if (!mm_slot->rmap_list) { hash_del(&slot->hash); list_del(&slot->mm_node); easy_to_free = 1; } else { list_move(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); } unlock: spin_unlock(&ksm_mmlist_lock); if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); mm_flags_clear(MMF_VM_MERGE_ANY, mm); mm_flags_clear(MMF_VM_MERGEABLE, mm); mmdrop(mm); } else if (mm_slot) { mmap_write_lock(mm); mmap_write_unlock(mm); } trace_ksm_exit(mm); } struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct page *page = folio_page(folio, 0); struct anon_vma *anon_vma = folio_anon_vma(folio); struct folio *new_folio; if (folio_test_large(folio)) return folio; if (folio_test_ksm(folio)) { if (folio_stable_node(folio) && !(ksm_run & KSM_RUN_UNMERGE)) return folio; /* no need to copy it */ } else if (!anon_vma) { return folio; /* no need to copy it */ } else if (folio->index == linear_page_index(vma, addr) && anon_vma->root == vma->anon_vma->root) { return folio; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); if (!folio_test_uptodate(folio)) return folio; /* let do_swap_page report the error */ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (new_folio && mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) { folio_put(new_folio); new_folio = NULL; } if (new_folio) { if (copy_mc_user_highpage(folio_page(new_folio, 0), page, addr, vma)) { folio_put(new_folio); return ERR_PTR(-EHWPOISON); } folio_set_dirty(new_folio); __folio_mark_uptodate(new_folio); __folio_set_locked(new_folio); #ifdef CONFIG_SWAP count_vm_event(KSM_SWPIN_COPY); #endif } return new_folio; } void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; int search_new_forks = 0; VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio); /* * Rely on the page lock to protect against concurrent modifications * to that page's node of the stable tree. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); stable_node = folio_stable_node(folio); if (!stable_node) return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; cond_resched(); if (!anon_vma_trylock_read(anon_vma)) { if (rwc->try_lock) { rwc->contended = true; return; } anon_vma_lock_read(anon_vma); } anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { unsigned long addr; cond_resched(); vma = vmac->vma; /* Ignore the stable/unstable/sqnr flags */ addr = rmap_item->address & PAGE_MASK; if (addr < vma->vm_start || addr >= vma->vm_end) continue; /* * Initially we examine only the vma which covers this * rmap_item; but later, if there is still work to do, * we examine covering vmas in other mms: in case they * were forked from the original since ksmd passed. */ if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; } if (rwc->done && rwc->done(folio)) { anon_vma_unlock_read(anon_vma); return; } } anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; } #ifdef CONFIG_MEMORY_FAILURE /* * Collect processes when the error hit an ksm page. */ void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; struct vm_area_struct *vma; struct task_struct *tsk; stable_node = folio_stable_node(folio); if (!stable_node) return; hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *av = rmap_item->anon_vma; anon_vma_lock_read(av); rcu_read_lock(); for_each_process(tsk) { struct anon_vma_chain *vmac; unsigned long addr; struct task_struct *t = task_early_kill(tsk, force_early); if (!t) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, 0, ULONG_MAX) { vma = vmac->vma; if (vma->vm_mm == t->mm) { addr = rmap_item->address & PAGE_MASK; add_to_kill_ksm(t, page, vma, to_kill, addr); } } } rcu_read_unlock(); anon_vma_unlock_read(av); } } #endif #ifdef CONFIG_MIGRATION void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) { struct ksm_stable_node *stable_node; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio); VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio); stable_node = folio_stable_node(folio); if (stable_node) { VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio); stable_node->kpfn = folio_pfn(newfolio); /* * newfolio->mapping was set in advance; now we need smp_wmb() * to make sure that the new stable_node->kpfn is visible * to ksm_get_folio() before it can see that folio->mapping * has gone stale (or that the swapcache flag has been cleared). */ smp_wmb(); folio_set_stable_node(folio, NULL); } } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_HOTREMOVE static void wait_while_offlining(void) { while (ksm_run & KSM_RUN_OFFLINE) { mutex_unlock(&ksm_thread_mutex); wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), TASK_UNINTERRUPTIBLE); mutex_lock(&ksm_thread_mutex); } } static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) { /* * Don't ksm_get_folio, page has already gone: * which is why we keep kpfn instead of page* */ remove_node_from_stable_tree(stable_node); return true; } return false; } static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn, struct rb_root *root) { struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { VM_BUG_ON(is_stable_node_dup(stable_node)); return stable_node_dup_remove_range(stable_node, start_pfn, end_pfn); } hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { VM_BUG_ON(!is_stable_node_dup(dup)); stable_node_dup_remove_range(dup, start_pfn, end_pfn); } if (hlist_empty(&stable_node->hlist)) { free_stable_node_chain(stable_node, root); return true; /* notify caller that tree was rebalanced */ } else return false; } static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { struct ksm_stable_node *stable_node, *next; struct rb_node *node; int nid; for (nid = 0; nid < ksm_nr_node_ids; nid++) { node = rb_first(root_stable_tree + nid); while (node) { stable_node = rb_entry(node, struct ksm_stable_node, node); if (stable_node_chain_remove_range(stable_node, start_pfn, end_pfn, root_stable_tree + nid)) node = rb_first(root_stable_tree + nid); else node = rb_next(node); cond_resched(); } } list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) remove_node_from_stable_tree(stable_node); cond_resched(); } } static int ksm_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; switch (action) { case MEM_GOING_OFFLINE: /* * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() * and remove_all_stable_nodes() while memory is going offline: * it is unsafe for them to touch the stable tree at this time. * But unmerge_ksm_pages(), rmap lookups and other entry points * which do not need the ksm_thread_mutex are all safe. */ mutex_lock(&ksm_thread_mutex); ksm_run |= KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); break; case MEM_OFFLINE: /* * Most of the work is done by page migration; but there might * be a few stable_nodes left over, still pointing to struct * pages which have been offlined: prune those from the tree, * otherwise ksm_get_folio() might later try to access a * non-existent struct page. */ ksm_check_stable_tree(mn->start_pfn, mn->start_pfn + mn->nr_pages); fallthrough; case MEM_CANCEL_OFFLINE: mutex_lock(&ksm_thread_mutex); ksm_run &= ~KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); smp_mb(); /* wake_up_bit advises this */ wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE)); break; } return NOTIFY_OK; } #else static void wait_while_offlining(void) { } #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_PROC_FS /* * The process is mergeable only if any VMA is currently * applicable to KSM. * * The mmap lock must be held in read mode. */ bool ksm_process_mergeable(struct mm_struct *mm) { struct vm_area_struct *vma; mmap_assert_locked(mm); VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) if (vma->vm_flags & VM_MERGEABLE) return true; return false; } long ksm_process_profit(struct mm_struct *mm) { return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE - mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); } #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSFS /* * This all compiles without CONFIG_SYSFS, but is a waste of space. */ #define KSM_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KSM_ATTR(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static ssize_t sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs); } static ssize_t sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; ksm_thread_sleep_millisecs = msecs; wake_up_interruptible(&ksm_iter_wait); return count; } KSM_ATTR(sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int nr_pages; int err; if (ksm_advisor != KSM_ADVISOR_NONE) return -EINVAL; err = kstrtouint(buf, 10, &nr_pages); if (err) return -EINVAL; ksm_thread_pages_to_scan = nr_pages; return count; } KSM_ATTR(pages_to_scan); static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_run); } static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int flags; int err; err = kstrtouint(buf, 10, &flags); if (err) return -EINVAL; if (flags > KSM_RUN_UNMERGE) return -EINVAL; /* * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, * breaking COW to free the pages_shared (but leaves mm_slots * on the list for when ksmd may be set running again). */ mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { set_current_oom_origin(); err = unmerge_and_remove_all_rmap_items(); clear_current_oom_origin(); if (err) { ksm_run = KSM_RUN_STOP; count = err; } } } mutex_unlock(&ksm_thread_mutex); if (flags & KSM_RUN_MERGE) wake_up_interruptible(&ksm_thread_wait); return count; } KSM_ATTR(run); #ifdef CONFIG_NUMA static ssize_t merge_across_nodes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes); } static ssize_t merge_across_nodes_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long knob; err = kstrtoul(buf, 10, &knob); if (err) return err; if (knob > 1) return -EINVAL; mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_merge_across_nodes != knob) { if (ksm_pages_shared || remove_all_stable_nodes()) err = -EBUSY; else if (root_stable_tree == one_stable_tree) { struct rb_root *buf; /* * This is the first time that we switch away from the * default of merging across nodes: must now allocate * a buffer to hold as many roots as may be needed. * Allocate stable and unstable together: * MAXSMP NODES_SHIFT 10 will use 16kB. */ buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf), GFP_KERNEL); /* Let us assume that RB_ROOT is NULL is zero */ if (!buf) err = -ENOMEM; else { root_stable_tree = buf; root_unstable_tree = buf + nr_node_ids; /* Stable tree is empty but not the unstable */ root_unstable_tree[0] = one_unstable_tree[0]; } } if (!err) { ksm_merge_across_nodes = knob; ksm_nr_node_ids = knob ? 1 : nr_node_ids; } } mutex_unlock(&ksm_thread_mutex); return err ? err : count; } KSM_ATTR(merge_across_nodes); #endif static ssize_t use_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_use_zero_pages); } static ssize_t use_zero_pages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; bool value; err = kstrtobool(buf, &value); if (err) return -EINVAL; ksm_use_zero_pages = value; return count; } KSM_ATTR(use_zero_pages); static ssize_t max_page_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_max_page_sharing); } static ssize_t max_page_sharing_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; int knob; err = kstrtoint(buf, 10, &knob); if (err) return err; /* * When a KSM page is created it is shared by 2 mappings. This * being a signed comparison, it implicitly verifies it's not * negative. */ if (knob < 2) return -EINVAL; if (READ_ONCE(ksm_max_page_sharing) == knob) return count; mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_max_page_sharing != knob) { if (ksm_pages_shared || remove_all_stable_nodes()) err = -EBUSY; else ksm_max_page_sharing = knob; } mutex_unlock(&ksm_thread_mutex); return err ? err : count; } KSM_ATTR(max_page_sharing); static ssize_t pages_scanned_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_scanned); } KSM_ATTR_RO(pages_scanned); static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_shared); } KSM_ATTR_RO(pages_shared); static ssize_t pages_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_sharing); } KSM_ATTR_RO(pages_sharing); static ssize_t pages_unshared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_unshared); } KSM_ATTR_RO(pages_unshared); static ssize_t pages_volatile_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { long ksm_pages_volatile; ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared - ksm_pages_sharing - ksm_pages_unshared; /* * It was not worth any locking to calculate that statistic, * but it might therefore sometimes be negative: conceal that. */ if (ksm_pages_volatile < 0) ksm_pages_volatile = 0; return sysfs_emit(buf, "%ld\n", ksm_pages_volatile); } KSM_ATTR_RO(pages_volatile); static ssize_t pages_skipped_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_skipped); } KSM_ATTR_RO(pages_skipped); static ssize_t ksm_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages)); } KSM_ATTR_RO(ksm_zero_pages); static ssize_t general_profit_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { long general_profit; general_profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE - ksm_rmap_items * sizeof(struct ksm_rmap_item); return sysfs_emit(buf, "%ld\n", general_profit); } KSM_ATTR_RO(general_profit); static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups); } KSM_ATTR_RO(stable_node_dups); static ssize_t stable_node_chains_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains); } KSM_ATTR_RO(stable_node_chains); static ssize_t stable_node_chains_prune_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); } static ssize_t stable_node_chains_prune_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; ksm_stable_node_chains_prune_millisecs = msecs; return count; } KSM_ATTR(stable_node_chains_prune_millisecs); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr); } KSM_ATTR_RO(full_scans); static ssize_t smart_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_smart_scan); } static ssize_t smart_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; bool value; err = kstrtobool(buf, &value); if (err) return -EINVAL; ksm_smart_scan = value; return count; } KSM_ATTR(smart_scan); static ssize_t advisor_mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { const char *output; if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) output = "none [scan-time]"; else output = "[none] scan-time"; return sysfs_emit(buf, "%s\n", output); } static ssize_t advisor_mode_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { enum ksm_advisor_type curr_advisor = ksm_advisor; if (sysfs_streq("scan-time", buf)) ksm_advisor = KSM_ADVISOR_SCAN_TIME; else if (sysfs_streq("none", buf)) ksm_advisor = KSM_ADVISOR_NONE; else return -EINVAL; /* Set advisor default values */ if (curr_advisor != ksm_advisor) set_advisor_defaults(); return count; } KSM_ATTR(advisor_mode); static ssize_t advisor_max_cpu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_advisor_max_cpu); } static ssize_t advisor_max_cpu_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_max_cpu = value; return count; } KSM_ATTR(advisor_max_cpu); static ssize_t advisor_min_pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_min_pages_to_scan); } static ssize_t advisor_min_pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_min_pages_to_scan = value; return count; } KSM_ATTR(advisor_min_pages_to_scan); static ssize_t advisor_max_pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_max_pages_to_scan); } static ssize_t advisor_max_pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_max_pages_to_scan = value; return count; } KSM_ATTR(advisor_max_pages_to_scan); static ssize_t advisor_target_scan_time_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_target_scan_time); } static ssize_t advisor_target_scan_time_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; if (value < 1) return -EINVAL; ksm_advisor_target_scan_time = value; return count; } KSM_ATTR(advisor_target_scan_time); static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, &pages_scanned_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, &pages_skipped_attr.attr, &ksm_zero_pages_attr.attr, &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif &max_page_sharing_attr.attr, &stable_node_chains_attr.attr, &stable_node_dups_attr.attr, &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, &general_profit_attr.attr, &smart_scan_attr.attr, &advisor_mode_attr.attr, &advisor_max_cpu_attr.attr, &advisor_min_pages_to_scan_attr.attr, &advisor_max_pages_to_scan_attr.attr, &advisor_target_scan_time_attr.attr, NULL, }; static const struct attribute_group ksm_attr_group = { .attrs = ksm_attrs, .name = "ksm", }; #endif /* CONFIG_SYSFS */ static int __init ksm_init(void) { struct task_struct *ksm_thread; int err; /* The correct value depends on page size and endianness */ zero_checksum = calc_checksum(ZERO_PAGE(0)); /* Default to false for backwards compatibility */ ksm_use_zero_pages = false; err = ksm_slab_init(); if (err) goto out; ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); if (IS_ERR(ksm_thread)) { pr_err("ksm: creating kthread failed\n"); err = PTR_ERR(ksm_thread); goto out_free; } #ifdef CONFIG_SYSFS err = sysfs_create_group(mm_kobj, &ksm_attr_group); if (err) { pr_err("ksm: register sysfs failed\n"); kthread_stop(ksm_thread); goto out_free; } #else ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ #endif /* CONFIG_SYSFS */ #ifdef CONFIG_MEMORY_HOTREMOVE /* There is no significance to this priority 100 */ hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI); #endif return 0; out_free: ksm_slab_free(); out: return err; } subsys_initcall(ksm_init);
1 1 2 2 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 // SPDX-License-Identifier: GPL-2.0-only /* * Input layer to RF Kill interface connector * * Copyright (c) 2007 Dmitry Torokhov * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * * If you ever run into a situation in which you have a SW_ type rfkill * input device, then you can revive code that was removed in the patch * "rfkill-input: remove unused code". */ #include <linux/input.h> #include <linux/slab.h> #include <linux/moduleparam.h> #include <linux/workqueue.h> #include <linux/init.h> #include <linux/rfkill.h> #include <linux/sched.h> #include "rfkill.h" enum rfkill_input_master_mode { RFKILL_INPUT_MASTER_UNLOCK = 0, RFKILL_INPUT_MASTER_RESTORE = 1, RFKILL_INPUT_MASTER_UNBLOCKALL = 2, NUM_RFKILL_INPUT_MASTER_MODES }; /* Delay (in ms) between consecutive switch ops */ #define RFKILL_OPS_DELAY 200 static enum rfkill_input_master_mode rfkill_master_switch_mode = RFKILL_INPUT_MASTER_UNBLOCKALL; module_param_named(master_switch_mode, rfkill_master_switch_mode, uint, 0); MODULE_PARM_DESC(master_switch_mode, "SW_RFKILL_ALL ON should: 0=do nothing (only unlock); 1=restore; 2=unblock all"); static DEFINE_SPINLOCK(rfkill_op_lock); static bool rfkill_op_pending; static unsigned long rfkill_sw_pending[BITS_TO_LONGS(NUM_RFKILL_TYPES)]; static unsigned long rfkill_sw_state[BITS_TO_LONGS(NUM_RFKILL_TYPES)]; enum rfkill_sched_op { RFKILL_GLOBAL_OP_EPO = 0, RFKILL_GLOBAL_OP_RESTORE, RFKILL_GLOBAL_OP_UNLOCK, RFKILL_GLOBAL_OP_UNBLOCK, }; static enum rfkill_sched_op rfkill_master_switch_op; static enum rfkill_sched_op rfkill_op; static void __rfkill_handle_global_op(enum rfkill_sched_op op) { unsigned int i; switch (op) { case RFKILL_GLOBAL_OP_EPO: rfkill_epo(); break; case RFKILL_GLOBAL_OP_RESTORE: rfkill_restore_states(); break; case RFKILL_GLOBAL_OP_UNLOCK: rfkill_remove_epo_lock(); break; case RFKILL_GLOBAL_OP_UNBLOCK: rfkill_remove_epo_lock(); for (i = 0; i < NUM_RFKILL_TYPES; i++) rfkill_switch_all(i, false); break; default: /* memory corruption or bug, fail safely */ rfkill_epo(); WARN(1, "Unknown requested operation %d! " "rfkill Emergency Power Off activated\n", op); } } static void __rfkill_handle_normal_op(const enum rfkill_type type, const bool complement) { bool blocked; blocked = rfkill_get_global_sw_state(type); if (complement) blocked = !blocked; rfkill_switch_all(type, blocked); } static void rfkill_op_handler(struct work_struct *work) { unsigned int i; bool c; spin_lock_irq(&rfkill_op_lock); do { if (rfkill_op_pending) { enum rfkill_sched_op op = rfkill_op; rfkill_op_pending = false; memset(rfkill_sw_pending, 0, sizeof(rfkill_sw_pending)); spin_unlock_irq(&rfkill_op_lock); __rfkill_handle_global_op(op); spin_lock_irq(&rfkill_op_lock); /* * handle global ops first -- during unlocked period * we might have gotten a new global op. */ if (rfkill_op_pending) continue; } if (rfkill_is_epo_lock_active()) continue; for (i = 0; i < NUM_RFKILL_TYPES; i++) { if (__test_and_clear_bit(i, rfkill_sw_pending)) { c = __test_and_clear_bit(i, rfkill_sw_state); spin_unlock_irq(&rfkill_op_lock); __rfkill_handle_normal_op(i, c); spin_lock_irq(&rfkill_op_lock); } } } while (rfkill_op_pending); spin_unlock_irq(&rfkill_op_lock); } static DECLARE_DELAYED_WORK(rfkill_op_work, rfkill_op_handler); static unsigned long rfkill_last_scheduled; static unsigned long rfkill_ratelimit(const unsigned long last) { const unsigned long delay = msecs_to_jiffies(RFKILL_OPS_DELAY); return time_after(jiffies, last + delay) ? 0 : delay; } static void rfkill_schedule_ratelimited(void) { if (schedule_delayed_work(&rfkill_op_work, rfkill_ratelimit(rfkill_last_scheduled))) rfkill_last_scheduled = jiffies; } static void rfkill_schedule_global_op(enum rfkill_sched_op op) { unsigned long flags; spin_lock_irqsave(&rfkill_op_lock, flags); rfkill_op = op; rfkill_op_pending = true; if (op == RFKILL_GLOBAL_OP_EPO && !rfkill_is_epo_lock_active()) { /* bypass the limiter for EPO */ mod_delayed_work(system_percpu_wq, &rfkill_op_work, 0); rfkill_last_scheduled = jiffies; } else rfkill_schedule_ratelimited(); spin_unlock_irqrestore(&rfkill_op_lock, flags); } static void rfkill_schedule_toggle(enum rfkill_type type) { unsigned long flags; if (rfkill_is_epo_lock_active()) return; spin_lock_irqsave(&rfkill_op_lock, flags); if (!rfkill_op_pending) { __set_bit(type, rfkill_sw_pending); __change_bit(type, rfkill_sw_state); rfkill_schedule_ratelimited(); } spin_unlock_irqrestore(&rfkill_op_lock, flags); } static void rfkill_schedule_evsw_rfkillall(int state) { if (state) rfkill_schedule_global_op(rfkill_master_switch_op); else rfkill_schedule_global_op(RFKILL_GLOBAL_OP_EPO); } static void rfkill_event(struct input_handle *handle, unsigned int type, unsigned int code, int data) { if (type == EV_KEY && data == 1) { switch (code) { case KEY_WLAN: rfkill_schedule_toggle(RFKILL_TYPE_WLAN); break; case KEY_BLUETOOTH: rfkill_schedule_toggle(RFKILL_TYPE_BLUETOOTH); break; case KEY_UWB: rfkill_schedule_toggle(RFKILL_TYPE_UWB); break; case KEY_WIMAX: rfkill_schedule_toggle(RFKILL_TYPE_WIMAX); break; case KEY_RFKILL: rfkill_schedule_toggle(RFKILL_TYPE_ALL); break; } } else if (type == EV_SW && code == SW_RFKILL_ALL) rfkill_schedule_evsw_rfkillall(data); } static int rfkill_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { struct input_handle *handle; int error; handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL); if (!handle) return -ENOMEM; handle->dev = dev; handle->handler = handler; handle->name = "rfkill"; /* causes rfkill_start() to be called */ error = input_register_handle(handle); if (error) goto err_free_handle; error = input_open_device(handle); if (error) goto err_unregister_handle; return 0; err_unregister_handle: input_unregister_handle(handle); err_free_handle: kfree(handle); return error; } static void rfkill_start(struct input_handle *handle) { /* * Take event_lock to guard against configuration changes, we * should be able to deal with concurrency with rfkill_event() * just fine (which event_lock will also avoid). */ spin_lock_irq(&handle->dev->event_lock); if (test_bit(EV_SW, handle->dev->evbit) && test_bit(SW_RFKILL_ALL, handle->dev->swbit)) rfkill_schedule_evsw_rfkillall(test_bit(SW_RFKILL_ALL, handle->dev->sw)); spin_unlock_irq(&handle->dev->event_lock); } static void rfkill_disconnect(struct input_handle *handle) { input_close_device(handle); input_unregister_handle(handle); kfree(handle); } static const struct input_device_id rfkill_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(KEY_WLAN)] = BIT_MASK(KEY_WLAN) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(KEY_BLUETOOTH)] = BIT_MASK(KEY_BLUETOOTH) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(KEY_UWB)] = BIT_MASK(KEY_UWB) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(KEY_WIMAX)] = BIT_MASK(KEY_WIMAX) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(KEY_RFKILL)] = BIT_MASK(KEY_RFKILL) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_SWBIT, .evbit = { BIT(EV_SW) }, .swbit = { [BIT_WORD(SW_RFKILL_ALL)] = BIT_MASK(SW_RFKILL_ALL) }, }, { } }; static struct input_handler rfkill_handler = { .name = "rfkill", .event = rfkill_event, .connect = rfkill_connect, .start = rfkill_start, .disconnect = rfkill_disconnect, .id_table = rfkill_ids, }; int __init rfkill_handler_init(void) { switch (rfkill_master_switch_mode) { case RFKILL_INPUT_MASTER_UNBLOCKALL: rfkill_master_switch_op = RFKILL_GLOBAL_OP_UNBLOCK; break; case RFKILL_INPUT_MASTER_RESTORE: rfkill_master_switch_op = RFKILL_GLOBAL_OP_RESTORE; break; case RFKILL_INPUT_MASTER_UNLOCK: rfkill_master_switch_op = RFKILL_GLOBAL_OP_UNLOCK; break; default: return -EINVAL; } /* Avoid delay at first schedule */ rfkill_last_scheduled = jiffies - msecs_to_jiffies(RFKILL_OPS_DELAY) - 1; return input_register_handler(&rfkill_handler); } void __exit rfkill_handler_exit(void) { input_unregister_handler(&rfkill_handler); cancel_delayed_work_sync(&rfkill_op_work); }
8 8 8 8 23 14 14 23 2 28 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 /* * Linear conversion Plug-In * Copyright (c) 1999 by Jaroslav Kysela <perex@perex.cz>, * Abramo Bagnara <abramo@alsa-project.org> * * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/time.h> #include <sound/core.h> #include <sound/pcm.h> #include "pcm_plugin.h" /* * Basic linear conversion plugin */ struct linear_priv { int cvt_endian; /* need endian conversion? */ unsigned int src_ofs; /* byte offset in source format */ unsigned int dst_ofs; /* byte soffset in destination format */ unsigned int copy_ofs; /* byte offset in temporary u32 data */ unsigned int dst_bytes; /* byte size of destination format */ unsigned int copy_bytes; /* bytes to copy per conversion */ unsigned int flip; /* MSB flip for signeness, done after endian conv */ }; static inline void do_convert(struct linear_priv *data, unsigned char *dst, unsigned char *src) { unsigned int tmp = 0; unsigned char *p = (unsigned char *)&tmp; memcpy(p + data->copy_ofs, src + data->src_ofs, data->copy_bytes); if (data->cvt_endian) tmp = swab32(tmp); tmp ^= data->flip; memcpy(dst, p + data->dst_ofs, data->dst_bytes); } static void convert(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { struct linear_priv *data = (struct linear_priv *)plugin->extra_data; int channel; int nchannels = plugin->src_format.channels; for (channel = 0; channel < nchannels; ++channel) { char *src; char *dst; int src_step, dst_step; snd_pcm_uframes_t frames1; if (!src_channels[channel].enabled) { if (dst_channels[channel].wanted) snd_pcm_area_silence(&dst_channels[channel].area, 0, frames, plugin->dst_format.format); dst_channels[channel].enabled = 0; continue; } dst_channels[channel].enabled = 1; src = src_channels[channel].area.addr + src_channels[channel].area.first / 8; dst = dst_channels[channel].area.addr + dst_channels[channel].area.first / 8; src_step = src_channels[channel].area.step / 8; dst_step = dst_channels[channel].area.step / 8; frames1 = frames; while (frames1-- > 0) { do_convert(data, dst, src); src += src_step; dst += dst_step; } } } static snd_pcm_sframes_t linear_transfer(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { if (snd_BUG_ON(!plugin || !src_channels || !dst_channels)) return -ENXIO; if (frames == 0) return 0; #ifdef CONFIG_SND_DEBUG { unsigned int channel; for (channel = 0; channel < plugin->src_format.channels; channel++) { if (snd_BUG_ON(src_channels[channel].area.first % 8 || src_channels[channel].area.step % 8)) return -ENXIO; if (snd_BUG_ON(dst_channels[channel].area.first % 8 || dst_channels[channel].area.step % 8)) return -ENXIO; } } #endif if (frames > dst_channels[0].frames) frames = dst_channels[0].frames; convert(plugin, src_channels, dst_channels, frames); return frames; } static void init_data(struct linear_priv *data, snd_pcm_format_t src_format, snd_pcm_format_t dst_format) { int src_le, dst_le, src_bytes, dst_bytes; src_bytes = snd_pcm_format_width(src_format) / 8; dst_bytes = snd_pcm_format_width(dst_format) / 8; src_le = snd_pcm_format_little_endian(src_format) > 0; dst_le = snd_pcm_format_little_endian(dst_format) > 0; data->dst_bytes = dst_bytes; data->cvt_endian = src_le != dst_le; data->copy_bytes = src_bytes < dst_bytes ? src_bytes : dst_bytes; if (src_le) { data->copy_ofs = 4 - data->copy_bytes; data->src_ofs = src_bytes - data->copy_bytes; } else data->src_ofs = snd_pcm_format_physical_width(src_format) / 8 - src_bytes; if (dst_le) data->dst_ofs = 4 - data->dst_bytes; else data->dst_ofs = snd_pcm_format_physical_width(dst_format) / 8 - dst_bytes; if (snd_pcm_format_signed(src_format) != snd_pcm_format_signed(dst_format)) { if (dst_le) data->flip = (__force u32)cpu_to_le32(0x80000000); else data->flip = (__force u32)cpu_to_be32(0x80000000); } } int snd_pcm_plugin_build_linear(struct snd_pcm_substream *plug, struct snd_pcm_plugin_format *src_format, struct snd_pcm_plugin_format *dst_format, struct snd_pcm_plugin **r_plugin) { int err; struct linear_priv *data; struct snd_pcm_plugin *plugin; if (snd_BUG_ON(!r_plugin)) return -ENXIO; *r_plugin = NULL; if (snd_BUG_ON(src_format->rate != dst_format->rate)) return -ENXIO; if (snd_BUG_ON(src_format->channels != dst_format->channels)) return -ENXIO; if (snd_BUG_ON(!snd_pcm_format_linear(src_format->format) || !snd_pcm_format_linear(dst_format->format))) return -ENXIO; err = snd_pcm_plugin_build(plug, "linear format conversion", src_format, dst_format, sizeof(struct linear_priv), &plugin); if (err < 0) return err; data = (struct linear_priv *)plugin->extra_data; init_data(data, src_format->format, dst_format->format); plugin->transfer = linear_transfer; *r_plugin = plugin; return 0; }
3 3 2 4 1 26 18 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ocfs2.h * * Defines macros and structures used in OCFS2 * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ #ifndef OCFS2_H #define OCFS2_H #include <linux/spinlock.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/list.h> #include <linux/llist.h> #include <linux/rbtree.h> #include <linux/workqueue.h> #include <linux/kref.h> #include <linux/mutex.h> #include <linux/lockdep.h> #include <linux/jbd2.h> /* For union ocfs2_dlm_lksb */ #include "stackglue.h" #include "ocfs2_fs.h" #include "ocfs2_lockid.h" #include "ocfs2_ioctl.h" /* For struct ocfs2_blockcheck_stats */ #include "blockcheck.h" #include "reservations.h" #include "filecheck.h" /* Caching of metadata buffers */ /* Most user visible OCFS2 inodes will have very few pieces of * metadata, but larger files (including bitmaps, etc) must be taken * into account when designing an access scheme. We allow a small * amount of inlined blocks to be stored on an array and grow the * structure into a rb tree when necessary. */ #define OCFS2_CACHE_INFO_MAX_ARRAY 2 /* Flags for ocfs2_caching_info */ enum ocfs2_caching_info_flags { /* Indicates that the metadata cache is using the inline array */ OCFS2_CACHE_FL_INLINE = 1<<1, }; struct ocfs2_caching_operations; struct ocfs2_caching_info { /* * The parent structure provides the locks, but because the * parent structure can differ, it provides locking operations * to struct ocfs2_caching_info. */ const struct ocfs2_caching_operations *ci_ops; /* next two are protected by trans_inc_lock */ /* which transaction were we created on? Zero if none. */ unsigned long ci_created_trans; /* last transaction we were a part of. */ unsigned long ci_last_trans; /* Cache structures */ unsigned int ci_flags; unsigned int ci_num_cached; union { sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY]; struct rb_root ci_tree; } ci_cache; }; /* * Need this prototype here instead of in uptodate.h because journal.h * uses it. */ struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci); /* this limits us to 256 nodes * if we need more, we can do a kmalloc for the map */ #define OCFS2_NODE_MAP_MAX_NODES 256 struct ocfs2_node_map { u16 num_nodes; unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)]; }; enum ocfs2_ast_action { OCFS2_AST_INVALID = 0, OCFS2_AST_ATTACH, OCFS2_AST_CONVERT, OCFS2_AST_DOWNCONVERT, }; /* actions for an unlockast function to take. */ enum ocfs2_unlock_action { OCFS2_UNLOCK_INVALID = 0, OCFS2_UNLOCK_CANCEL_CONVERT, OCFS2_UNLOCK_DROP_LOCK, }; /* ocfs2_lock_res->l_flags flags. */ #define OCFS2_LOCK_ATTACHED (0x00000001) /* we have initialized * the lvb */ #define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in * dlm_lock */ #define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to * downconvert*/ #define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */ #define OCFS2_LOCK_NEEDS_REFRESH (0x00000010) #define OCFS2_LOCK_REFRESHING (0x00000020) #define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization * for shutdown paths */ #define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track * when to skip queueing * a lock because it's * about to be * dropped. */ #define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ #define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ #define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a call to dlm_lock. Only exists with BUSY set. */ #define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread * from downconverting * before the upconvert * has completed */ #define OCFS2_LOCK_NONBLOCK_FINISHED (0x00001000) /* NONBLOCK cluster * lock has already * returned, do not block * dc thread from * downconverting */ struct ocfs2_lock_res_ops; typedef void (*ocfs2_lock_callback)(int status, unsigned long data); #ifdef CONFIG_OCFS2_FS_STATS struct ocfs2_lock_stats { u64 ls_total; /* Total wait in NSEC */ u32 ls_gets; /* Num acquires */ u32 ls_fail; /* Num failed acquires */ /* Storing max wait in usecs saves 24 bytes per inode */ u32 ls_max; /* Max wait in USEC */ u64 ls_last; /* Last unlock time in USEC */ }; #endif struct ocfs2_lock_res { void *l_priv; const struct ocfs2_lock_res_ops *l_ops; struct list_head l_blocked_list; struct list_head l_mask_waiters; struct list_head l_holders; unsigned long l_flags; char l_name[OCFS2_LOCK_ID_MAX_LEN]; unsigned int l_ro_holders; unsigned int l_ex_holders; signed char l_level; signed char l_requested; signed char l_blocking; /* Data packed - type enum ocfs2_lock_type */ unsigned char l_type; /* used from AST/BAST funcs. */ /* Data packed - enum type ocfs2_ast_action */ unsigned char l_action; /* Data packed - enum type ocfs2_unlock_action */ unsigned char l_unlock_action; unsigned int l_pending_gen; spinlock_t l_lock; struct ocfs2_dlm_lksb l_lksb; wait_queue_head_t l_event; struct list_head l_debug_list; #ifdef CONFIG_OCFS2_FS_STATS struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */ u32 l_lock_refresh; /* Disk refreshes */ u64 l_lock_wait; /* First lock wait time */ struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */ #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map l_lockdep_map; #endif }; enum ocfs2_orphan_reco_type { ORPHAN_NO_NEED_TRUNCATE = 0, ORPHAN_NEED_TRUNCATE, }; enum ocfs2_orphan_scan_state { ORPHAN_SCAN_ACTIVE, ORPHAN_SCAN_INACTIVE }; struct ocfs2_orphan_scan { struct mutex os_lock; struct ocfs2_super *os_osb; struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ struct delayed_work os_orphan_scan_work; time64_t os_scantime; /* time this node ran the scan */ u32 os_count; /* tracks node specific scans */ u32 os_seqno; /* tracks cluster wide scans */ atomic_t os_state; /* ACTIVE or INACTIVE */ }; struct ocfs2_dlm_debug { struct kref d_refcnt; u32 d_filter_secs; struct list_head d_lockres_tracking; }; enum ocfs2_vol_state { VOLUME_INIT = 0, VOLUME_MOUNTED, VOLUME_MOUNTED_QUOTAS, VOLUME_DISMOUNTED, VOLUME_DISABLED }; struct ocfs2_alloc_stats { atomic_t moves; atomic_t local_data; atomic_t bitmap_data; atomic_t bg_allocs; atomic_t bg_extends; }; enum ocfs2_local_alloc_state { OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for * this mountpoint. */ OCFS2_LA_ENABLED, /* Local alloc is in use. */ OCFS2_LA_THROTTLED, /* Local alloc is in use, but number * of bits has been reduced. */ OCFS2_LA_DISABLED /* Local alloc has temporarily been * disabled. */ }; enum ocfs2_mount_options { OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */ OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */ OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access control lists */ OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT writes */ OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ }; #define OCFS2_OSB_SOFT_RO 0x0001 #define OCFS2_OSB_HARD_RO 0x0002 #define OCFS2_OSB_ERROR_FS 0x0004 #define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_triggers { struct jbd2_buffer_trigger_type ot_triggers; int ot_offset; struct super_block *sb; }; enum ocfs2_journal_trigger_type { OCFS2_JTR_DI, OCFS2_JTR_EB, OCFS2_JTR_RB, OCFS2_JTR_GD, OCFS2_JTR_DB, OCFS2_JTR_XB, OCFS2_JTR_DQ, OCFS2_JTR_DR, OCFS2_JTR_DL, OCFS2_JTR_NONE /* This must be the last entry */ }; #define OCFS2_JOURNAL_TRIGGER_COUNT OCFS2_JTR_NONE void ocfs2_initialize_journal_triggers(struct super_block *sb, struct ocfs2_triggers triggers[]); enum ocfs2_recovery_state { OCFS2_REC_ENABLED = 0, OCFS2_REC_QUOTA_WANT_DISABLE, /* * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for * ocfs2_recovery_disable_quota() to work. */ OCFS2_REC_QUOTA_DISABLED, OCFS2_REC_WANT_DISABLE, /* * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work */ OCFS2_REC_DISABLED, }; struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; struct ocfs2_replay_map; struct ocfs2_quota_recovery; struct ocfs2_super { struct task_struct *commit_task; struct super_block *sb; struct inode *root_inode; struct inode *sys_root_inode; struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES]; struct inode **local_system_inodes; struct ocfs2_slot_info *slot_info; u32 *slot_recovery_generations; spinlock_t node_map_lock; u64 root_blkno; u64 system_dir_blkno; u64 bitmap_blkno; u32 bitmap_cpg; char *uuid_str; u32 uuid_hash; u8 *vol_label; u64 first_cluster_group_blkno; u32 fs_generation; u32 s_feature_compat; u32 s_feature_incompat; u32 s_feature_ro_compat; /* Protects s_next_generation, osb_flags and s_inode_steal_slot. * Could protect more on osb as it's very short lived. */ spinlock_t osb_lock; u32 s_next_generation; unsigned long osb_flags; u16 s_inode_steal_slot; u16 s_meta_steal_slot; atomic_t s_num_inodes_stolen; atomic_t s_num_meta_stolen; unsigned long s_mount_opt; unsigned int s_atime_quantum; unsigned int max_slots; unsigned int node_num; int slot_num; int preferred_slot; int s_sectsize_bits; int s_clustersize; int s_clustersize_bits; unsigned int s_xattr_inline_size; atomic_t vol_state; struct mutex recovery_lock; struct ocfs2_recovery_map *recovery_map; struct ocfs2_replay_map *replay_map; struct task_struct *recovery_thread_task; enum ocfs2_recovery_state recovery_state; wait_queue_head_t checkpoint_event; struct ocfs2_journal *journal; unsigned long osb_commit_interval; /* Journal triggers for checksum */ struct ocfs2_triggers s_journal_triggers[OCFS2_JOURNAL_TRIGGER_COUNT]; struct delayed_work la_enable_wq; /* * Must hold local alloc i_rwsem and osb->osb_lock to change * local_alloc_bits. Reads can be done under either lock. */ unsigned int local_alloc_bits; unsigned int local_alloc_default_bits; /* osb_clusters_at_boot can become stale! Do not trust it to * be up to date. */ unsigned int osb_clusters_at_boot; enum ocfs2_local_alloc_state local_alloc_state; /* protected * by osb_lock */ struct buffer_head *local_alloc_bh; u64 la_last_gd; struct ocfs2_reservation_map osb_la_resmap; unsigned int osb_resv_level; unsigned int osb_dir_resv_level; /* Next two fields are for local node slot recovery during * mount. */ struct ocfs2_dinode *local_alloc_copy; struct ocfs2_quota_recovery *quota_rec; struct ocfs2_blockcheck_stats osb_ecc_stats; struct ocfs2_alloc_stats alloc_stats; char dev_str[20]; /* "major,minor" of the device */ u8 osb_stackflags; char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1]; struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; struct rw_semaphore nfs_sync_rwlock; struct ocfs2_lock_res osb_trim_fs_lockres; struct mutex obs_trim_fs_mutex; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; wait_queue_head_t recovery_event; spinlock_t dc_task_lock; struct task_struct *dc_task; wait_queue_head_t dc_event; unsigned long dc_wake_sequence; unsigned long dc_work_sequence; /* * Any thread can add locks to the list, but the downconvert * thread is the only one allowed to remove locks. Any change * to this rule requires updating * ocfs2_downconvert_thread_do_work(). */ struct list_head blocked_lock_list; unsigned long blocked_lock_count; /* List of dquot structures to drop last reference to */ struct llist_head dquot_drop_list; struct work_struct dquot_drop_work; wait_queue_head_t osb_mount_event; /* Truncate log info */ struct inode *osb_tl_inode; struct buffer_head *osb_tl_bh; struct delayed_work osb_truncate_log_wq; atomic_t osb_tl_disable; /* * How many clusters in our truncate log. * It must be protected by osb_tl_inode->i_rwsem. */ unsigned int truncated_clusters; struct ocfs2_node_map osb_recovering_orphan_dirs; unsigned int *osb_orphan_wipes; wait_queue_head_t osb_wipe_event; struct ocfs2_orphan_scan osb_orphan_scan; /* used to protect metaecc calculation check of xattr. */ spinlock_t osb_xattr_lock; unsigned int osb_dx_mask; u32 osb_dx_seed[4]; /* the group we used to allocate inodes. */ u64 osb_inode_alloc_group; /* rb tree root for refcount lock. */ struct rb_root osb_rf_lock_tree; struct ocfs2_refcount_tree *osb_ref_tree_lru; struct mutex system_file_mutex; /* * OCFS2 needs to schedule several different types of work which * require cluster locking, disk I/O, recovery waits, etc. Since these * types of work tend to be heavy we avoid using the kernel events * workqueue and schedule on our own. */ struct workqueue_struct *ocfs2_wq; /* sysfs directory per partition */ struct kset *osb_dev_kset; /* file check related stuff */ struct ocfs2_filecheck_sysfs_entry osb_fc_ent; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) /* Useful typedef for passing around journal access functions */ typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); static inline int ocfs2_should_order_data(struct inode *inode) { if (!S_ISREG(inode->i_mode)) return 0; if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) return 0; return 1; } static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) return 1; return 0; } static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) { /* * Support for sparse files is a pre-requisite */ if (!ocfs2_sparse_alloc(osb)) return 0; if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN) return 1; return 0; } static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO) return 1; return 0; } static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) return 1; return 0; } static inline int ocfs2_supports_xattr(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR) return 1; return 0; } static inline int ocfs2_meta_ecc(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC) return 1; return 0; } static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) return 1; return 0; } static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) return 1; return 0; } static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) { if (ocfs2_supports_indexed_dirs(osb)) return OCFS2_DX_LINK_MAX; return OCFS2_LINK_MAX; } static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di) { u32 nlink = le16_to_cpu(di->i_links_count); u32 hi = le16_to_cpu(di->i_links_count_hi); nlink |= (hi << OCFS2_LINKS_HI_SHIFT); return nlink; } static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink) { u16 lo, hi; lo = nlink; hi = nlink >> OCFS2_LINKS_HI_SHIFT; di->i_links_count = cpu_to_le16(lo); di->i_links_count_hi = cpu_to_le16(hi); } static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n) { u32 links = ocfs2_read_links_count(di); links += n; ocfs2_set_links_count(di, links); } static inline int ocfs2_refcount_tree(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) return 1; return 0; } /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock * too! */ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, unsigned long flag) { spin_lock(&osb->osb_lock); osb->osb_flags |= flag; spin_unlock(&osb->osb_lock); } static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { spin_lock(&osb->osb_lock); osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO); if (hard) osb->osb_flags |= OCFS2_OSB_HARD_RO; else osb->osb_flags |= OCFS2_OSB_SOFT_RO; spin_unlock(&osb->osb_lock); } static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb) { int ret; spin_lock(&osb->osb_lock); ret = osb->osb_flags & OCFS2_OSB_HARD_RO; spin_unlock(&osb->osb_lock); return ret; } static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) { int ret; spin_lock(&osb->osb_lock); ret = osb->osb_flags & OCFS2_OSB_SOFT_RO; spin_unlock(&osb->osb_lock); return ret; } static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb) { return (osb->s_feature_incompat & (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)); } static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) { if (ocfs2_clusterinfo_valid(osb) && memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, OCFS2_STACK_LABEL_LEN)) return 1; return 0; } static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb) { if (ocfs2_clusterinfo_valid(osb) && !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, OCFS2_STACK_LABEL_LEN)) return 1; return 0; } static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) { return ocfs2_o2cb_stack(osb) && (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT); } static inline int ocfs2_mount_local(struct ocfs2_super *osb) { return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); } static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) { return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP); } #define OCFS2_IS_VALID_DINODE(ptr) \ (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) #define OCFS2_IS_VALID_GROUP_DESC(ptr) \ (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) #define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) #define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) #define OCFS2_IS_VALID_DX_ROOT(ptr) \ (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE)) #define OCFS2_IS_VALID_DX_LEAF(ptr) \ (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) #define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \ (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE)) static inline unsigned long ino_from_blkno(struct super_block *sb, u64 blkno) { return (unsigned long)(blkno & (u64)ULONG_MAX); } static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, u32 clusters) { int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; return (u64)clusters << c_to_b_bits; } static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb, u64 blocks) { int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; blocks += (1 << b_to_c_bits) - 1; return (u32)(blocks >> b_to_c_bits); } static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, u64 blocks) { int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; return (u32)(blocks >> b_to_c_bits); } static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, u64 bytes) { int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int clusters; bytes += OCFS2_SB(sb)->s_clustersize - 1; /* OCFS2 just cannot have enough clusters to overflow this */ clusters = (unsigned int)(bytes >> cl_bits); return clusters; } static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb, u64 bytes) { int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int clusters; clusters = (unsigned int)(bytes >> cl_bits); return clusters; } static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, u64 bytes) { bytes += sb->s_blocksize - 1; return bytes >> sb->s_blocksize_bits; } static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb, u32 clusters) { return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; } static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb, u64 blocks) { int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; unsigned int clusters; clusters = ocfs2_blocks_to_clusters(sb, blocks); return (u64)clusters << bits; } static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, u64 bytes) { int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int clusters; clusters = ocfs2_clusters_for_bytes(sb, bytes); return (u64)clusters << cl_bits; } static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb, u64 bytes) { u64 blocks; blocks = ocfs2_blocks_for_bytes(sb, bytes); return blocks << sb->s_blocksize_bits; } static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) { return (unsigned long)((bytes + 511) >> 9); } static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, unsigned long pg_index) { u32 clusters = pg_index; unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; if (unlikely(PAGE_SHIFT > cbits)) clusters = pg_index << (PAGE_SHIFT - cbits); else if (PAGE_SHIFT < cbits) clusters = pg_index >> (cbits - PAGE_SHIFT); return clusters; } /* * Find the 1st page index which covers the given clusters. */ static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb, u32 clusters) { unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; pgoff_t index = clusters; if (PAGE_SHIFT > cbits) { index = (pgoff_t)clusters >> (PAGE_SHIFT - cbits); } else if (PAGE_SHIFT < cbits) { index = (pgoff_t)clusters << (cbits - PAGE_SHIFT); } return index; } static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) { unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int pages_per_cluster = 1; if (PAGE_SHIFT < cbits) pages_per_cluster = 1 << (cbits - PAGE_SHIFT); return pages_per_cluster; } static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, unsigned int megs) { BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576); return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); } static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb, unsigned int clusters) { return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits); } static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) { __set_bit_le(bit, bitmap); } #define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) { __clear_bit_le(bit, bitmap); } #define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) #define ocfs2_test_bit test_bit_le #define ocfs2_find_next_zero_bit find_next_zero_bit_le #define ocfs2_find_next_bit find_next_bit_le static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr) { #if BITS_PER_LONG == 64 *bit += ((unsigned long) addr & 7UL) << 3; addr = (void *) ((unsigned long) addr & ~7UL); #elif BITS_PER_LONG == 32 *bit += ((unsigned long) addr & 3UL) << 3; addr = (void *) ((unsigned long) addr & ~3UL); #else #error "how many bits you are?!" #endif return addr; } static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap) { bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); ocfs2_set_bit(bit, bitmap); } static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap) { bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); ocfs2_clear_bit(bit, bitmap); } static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap) { bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); return ocfs2_test_bit(bit, bitmap); } static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max, int start) { int fix = 0, ret, tmpmax; bitmap = correct_addr_and_bit_unaligned(&fix, bitmap); tmpmax = max + fix; start += fix; ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix; if (ret > max) return max; return ret; } #endif /* OCFS2_H */
17 2 2 2 10 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "IPsec: " fmt #include <crypto/hash.h> #include <crypto/utils.h> #include <linux/err.h> #include <linux/module.h> #include <linux/slab.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ah.h> #include <linux/crypto.h> #include <linux/pfkeyv2.h> #include <linux/scatterlist.h> #include <net/icmp.h> #include <net/protocol.h> struct ah_skb_cb { struct xfrm_skb_cb xfrm; void *tmp; }; #define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, unsigned int size) { unsigned int len; len = size + crypto_ahash_digestsize(ahash); len = ALIGN(len, crypto_tfm_ctx_alignment()); len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash); len = ALIGN(len, __alignof__(struct scatterlist)); len += sizeof(struct scatterlist) * nfrags; return kmalloc(len, GFP_ATOMIC); } static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset) { return tmp + offset; } static inline u8 *ah_tmp_icv(void *tmp, unsigned int offset) { return tmp + offset; } static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash, u8 *icv) { struct ahash_request *req; req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash), crypto_tfm_ctx_alignment()); ahash_request_set_tfm(req, ahash); return req; } static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, struct ahash_request *req) { return (void *)ALIGN((unsigned long)(req + 1) + crypto_ahash_reqsize(ahash), __alignof__(struct scatterlist)); } /* Clear mutable options and find final destination to substitute * into IP header for icv calculation. Options are already checked * for validity, so paranoia is not required. */ static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) { unsigned char *optptr = (unsigned char *)(iph+1); int l = iph->ihl*4 - sizeof(struct iphdr); int optlen; while (l > 0) { switch (*optptr) { case IPOPT_END: return 0; case IPOPT_NOOP: l--; optptr++; continue; } optlen = optptr[1]; if (optlen<2 || optlen>l) return -EINVAL; switch (*optptr) { case IPOPT_SEC: case 0x85: /* Some "Extended Security" crap. */ case IPOPT_CIPSO: case IPOPT_RA: case 0x80|21: /* RFC1770 */ break; case IPOPT_LSRR: case IPOPT_SSRR: if (optlen < 6) return -EINVAL; memcpy(daddr, optptr+optlen-4, 4); fallthrough; default: memset(optptr, 0, optlen); } l -= optlen; optptr += optlen; } return 0; } static void ah_output_done(void *data, int err) { u8 *icv; struct iphdr *iph; struct sk_buff *skb = data; struct xfrm_state *x = skb_dst(skb)->xfrm; struct ah_data *ahp = x->data; struct iphdr *top_iph = ip_hdr(skb); struct ip_auth_hdr *ah = ip_auth_hdr(skb); int ihl = ip_hdrlen(skb); iph = AH_SKB_CB(skb)->tmp; icv = ah_tmp_icv(iph, ihl); memcpy(ah->auth_data, icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; top_iph->ttl = iph->ttl; top_iph->frag_off = iph->frag_off; if (top_iph->ihl != 5) { top_iph->daddr = iph->daddr; memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb->sk, skb, err); } static int ah_output(struct xfrm_state *x, struct sk_buff *skb) { int err; int nfrags; int ihl; u8 *icv; struct sk_buff *trailer; struct crypto_ahash *ahash; struct ahash_request *req; struct scatterlist *sg; struct iphdr *iph, *top_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; int seqhi_len = 0; __be32 *seqhi; int sglists = 0; struct scatterlist *seqhisg; ahp = x->data; ahash = ahp->ahash; if ((err = skb_cow_data(skb, 0, &trailer)) < 0) goto out; nfrags = err; skb_push(skb, -skb_network_offset(skb)); ah = ip_auth_hdr(skb); ihl = ip_hdrlen(skb); if (x->props.flags & XFRM_STATE_ESN) { sglists = 1; seqhi_len = sizeof(*seqhi); } err = -ENOMEM; iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len); if (!iph) goto out; seqhi = (__be32 *)((char *)iph + ihl); icv = ah_tmp_icv(seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); seqhisg = sg + nfrags; memset(ah->auth_data, 0, ahp->icv_trunc_len); top_iph = ip_hdr(skb); iph->tos = top_iph->tos; iph->ttl = top_iph->ttl; iph->frag_off = top_iph->frag_off; if (top_iph->ihl != 5) { iph->daddr = top_iph->daddr; memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); err = ip_clear_mutable_options(top_iph, &top_iph->daddr); if (err) goto out_free; } ah->nexthdr = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_AH; top_iph->tos = 0; top_iph->tot_len = htons(skb->len); top_iph->frag_off = 0; top_iph->ttl = 0; top_iph->check = 0; if (x->props.flags & XFRM_STATE_ALIGN4) ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; else ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; ah->reserved = 0; ah->spi = x->id.spi; ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags + sglists); err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); if (unlikely(err < 0)) goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); sg_set_buf(seqhisg, seqhi, seqhi_len); } ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_output_done, skb); AH_SKB_CB(skb)->tmp = iph; err = crypto_ahash_digest(req); if (err) { if (err == -EINPROGRESS) goto out; if (err == -ENOSPC) err = NET_XMIT_DROP; goto out_free; } memcpy(ah->auth_data, icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; top_iph->ttl = iph->ttl; top_iph->frag_off = iph->frag_off; if (top_iph->ihl != 5) { top_iph->daddr = iph->daddr; memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } out_free: kfree(iph); out: return err; } static void ah_input_done(void *data, int err) { u8 *auth_data; u8 *icv; struct iphdr *work_iph; struct sk_buff *skb = data; struct xfrm_state *x = xfrm_input_state(skb); struct ah_data *ahp = x->data; struct ip_auth_hdr *ah = ip_auth_hdr(skb); int ihl = ip_hdrlen(skb); int ah_hlen = (ah->hdrlen + 2) << 2; if (err) goto out; work_iph = AH_SKB_CB(skb)->tmp; auth_data = ah_tmp_auth(work_iph, ihl); icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len); err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out; err = ah->nexthdr; skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); if (x->props.mode == XFRM_MODE_TUNNEL) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -ihl); out: kfree(AH_SKB_CB(skb)->tmp); xfrm_input_resume(skb, err); } static int ah_input(struct xfrm_state *x, struct sk_buff *skb) { int ah_hlen; int ihl; int nexthdr; int nfrags; u8 *auth_data; u8 *icv; struct sk_buff *trailer; struct crypto_ahash *ahash; struct ahash_request *req; struct scatterlist *sg; struct iphdr *iph, *work_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; int err = -ENOMEM; int seqhi_len = 0; __be32 *seqhi; int sglists = 0; struct scatterlist *seqhisg; if (!pskb_may_pull(skb, sizeof(*ah))) goto out; ah = (struct ip_auth_hdr *)skb->data; ahp = x->data; ahash = ahp->ahash; nexthdr = ah->nexthdr; ah_hlen = (ah->hdrlen + 2) << 2; if (x->props.flags & XFRM_STATE_ALIGN4) { if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) && ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len)) goto out; } else { if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) goto out; } if (!pskb_may_pull(skb, ah_hlen)) goto out; /* We are going to _remove_ AH header to keep sockets happy, * so... Later this can change. */ if (skb_unclone(skb, GFP_ATOMIC)) goto out; skb->ip_summed = CHECKSUM_NONE; if ((err = skb_cow_data(skb, 0, &trailer)) < 0) goto out; nfrags = err; ah = (struct ip_auth_hdr *)skb->data; iph = ip_hdr(skb); ihl = ip_hdrlen(skb); if (x->props.flags & XFRM_STATE_ESN) { sglists = 1; seqhi_len = sizeof(*seqhi); } work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + ahp->icv_trunc_len + seqhi_len); if (!work_iph) { err = -ENOMEM; goto out; } seqhi = (__be32 *)((char *)work_iph + ihl); auth_data = ah_tmp_auth(seqhi, seqhi_len); icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); seqhisg = sg + nfrags; memcpy(work_iph, iph, ihl); memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); memset(ah->auth_data, 0, ahp->icv_trunc_len); iph->ttl = 0; iph->tos = 0; iph->frag_off = 0; iph->check = 0; if (ihl > sizeof(*iph)) { __be32 dummy; err = ip_clear_mutable_options(iph, &dummy); if (err) goto out_free; } skb_push(skb, ihl); sg_init_table(sg, nfrags + sglists); err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); if (unlikely(err < 0)) goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; sg_set_buf(seqhisg, seqhi, seqhi_len); } ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_input_done, skb); AH_SKB_CB(skb)->tmp = work_iph; err = crypto_ahash_digest(req); if (err) { if (err == -EINPROGRESS) goto out; goto out_free; } err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out_free; skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); if (x->props.mode == XFRM_MODE_TUNNEL) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -ihl); err = nexthdr; out_free: kfree (work_iph); out: return err; } static int ah4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); if (!x) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, IPPROTO_AH); else ipv4_redirect(skb, net, 0, IPPROTO_AH); xfrm_state_put(x); return 0; } static int ah_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { struct ah_data *ahp = NULL; struct xfrm_algo_desc *aalg_desc; struct crypto_ahash *ahash; if (!x->aalg) { NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm"); goto error; } if (x->encap) { NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation"); goto error; } ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); if (!ahp) return -ENOMEM; ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0); if (IS_ERR(ahash)) { NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; } ahp->ahash = ahash; if (crypto_ahash_setkey(ahash, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8)) { NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; } /* * Lookup the algorithm description maintained by xfrm_algo, * verify crypto transform properties, and store information * we need for AH processing. This lookup cannot fail here * after a successful crypto_alloc_ahash(). */ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); BUG_ON(!aalg_desc); if (aalg_desc->uinfo.auth.icv_fullbits/8 != crypto_ahash_digestsize(ahash)) { NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; } ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; if (x->props.flags & XFRM_STATE_ALIGN4) x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); else x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); if (x->props.mode == XFRM_MODE_TUNNEL) x->props.header_len += sizeof(struct iphdr); x->data = ahp; return 0; error: if (ahp) { crypto_free_ahash(ahp->ahash); kfree(ahp); } return -EINVAL; } static void ah_destroy(struct xfrm_state *x) { struct ah_data *ahp = x->data; if (!ahp) return; crypto_free_ahash(ahp->ahash); kfree(ahp); } static int ah4_rcv_cb(struct sk_buff *skb, int err) { return 0; } static const struct xfrm_type ah_type = { .owner = THIS_MODULE, .proto = IPPROTO_AH, .flags = XFRM_TYPE_REPLAY_PROT, .init_state = ah_init_state, .destructor = ah_destroy, .input = ah_input, .output = ah_output }; static struct xfrm4_protocol ah4_protocol = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = ah4_rcv_cb, .err_handler = ah4_err, .priority = 0, }; static int __init ah4_init(void) { if (xfrm_register_type(&ah_type, AF_INET) < 0) { pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ah_type, AF_INET); return -EAGAIN; } return 0; } static void __exit ah4_fini(void) { if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); xfrm_unregister_type(&ah_type, AF_INET); } module_init(ah4_init); module_exit(ah4_fini); MODULE_DESCRIPTION("IPv4 AH transformation library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH);
80 79 3 1 13 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _LINUX_RSTREASON_H #define _LINUX_RSTREASON_H #include <net/dropreason-core.h> #include <uapi/linux/mptcp.h> #define DEFINE_RST_REASON(FN, FNe) \ FN(NOT_SPECIFIED) \ FN(NO_SOCKET) \ FN(TCP_INVALID_ACK_SEQUENCE) \ FN(TCP_RFC7323_PAWS) \ FN(TCP_TOO_OLD_ACK) \ FN(TCP_ACK_UNSENT_DATA) \ FN(TCP_FLAGS) \ FN(TCP_OLD_ACK) \ FN(TCP_ABORT_ON_DATA) \ FN(TCP_TIMEWAIT_SOCKET) \ FN(INVALID_SYN) \ FN(TCP_ABORT_ON_CLOSE) \ FN(TCP_ABORT_ON_LINGER) \ FN(TCP_ABORT_ON_MEMORY) \ FN(TCP_STATE) \ FN(TCP_KEEPALIVE_TIMEOUT) \ FN(TCP_DISCONNECT_WITH_DATA) \ FN(MPTCP_RST_EUNSPEC) \ FN(MPTCP_RST_EMPTCP) \ FN(MPTCP_RST_ERESOURCE) \ FN(MPTCP_RST_EPROHIBIT) \ FN(MPTCP_RST_EWQ2BIG) \ FN(MPTCP_RST_EBADPERF) \ FN(MPTCP_RST_EMIDDLEBOX) \ FN(ERROR) \ FNe(MAX) /** * enum sk_rst_reason - the reasons of socket reset * * The reasons of sk reset, which are used in TCP/MPTCP protocols. * * There are three parts in order: * 1) skb drop reasons: relying on drop reasons for such as passive reset * 2) independent reset reasons: such as active reset reasons * 3) reset reasons in MPTCP: only for MPTCP use */ enum sk_rst_reason { /* Refer to include/net/dropreason-core.h * Rely on skb drop reasons because it indicates exactly why RST * could happen. */ /** @SK_RST_REASON_NOT_SPECIFIED: reset reason is not specified */ SK_RST_REASON_NOT_SPECIFIED, /** @SK_RST_REASON_NO_SOCKET: no valid socket that can be used */ SK_RST_REASON_NO_SOCKET, /** * @SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ * field because ack sequence is not in the window between snd_una * and snd_nxt */ SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE, /** * @SK_RST_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to * LINUX_MIB_PAWSESTABREJECTED, LINUX_MIB_PAWSACTIVEREJECTED */ SK_RST_REASON_TCP_RFC7323_PAWS, /** @SK_RST_REASON_TCP_TOO_OLD_ACK: TCP ACK is too old */ SK_RST_REASON_TCP_TOO_OLD_ACK, /** * @SK_RST_REASON_TCP_ACK_UNSENT_DATA: TCP ACK for data we haven't * sent yet */ SK_RST_REASON_TCP_ACK_UNSENT_DATA, /** @SK_RST_REASON_TCP_FLAGS: TCP flags invalid */ SK_RST_REASON_TCP_FLAGS, /** @SK_RST_REASON_TCP_OLD_ACK: TCP ACK is old, but in window */ SK_RST_REASON_TCP_OLD_ACK, /** * @SK_RST_REASON_TCP_ABORT_ON_DATA: abort on data * corresponding to LINUX_MIB_TCPABORTONDATA */ SK_RST_REASON_TCP_ABORT_ON_DATA, /* Here start with the independent reasons */ /** @SK_RST_REASON_TCP_TIMEWAIT_SOCKET: happen on the timewait socket */ SK_RST_REASON_TCP_TIMEWAIT_SOCKET, /** * @SK_RST_REASON_INVALID_SYN: receive bad syn packet * RFC 793 says if the state is not CLOSED/LISTEN/SYN-SENT then * "fourth, check the SYN bit,...If the SYN is in the window it is * an error, send a reset" */ SK_RST_REASON_INVALID_SYN, /** * @SK_RST_REASON_TCP_ABORT_ON_CLOSE: abort on close * corresponding to LINUX_MIB_TCPABORTONCLOSE */ SK_RST_REASON_TCP_ABORT_ON_CLOSE, /** * @SK_RST_REASON_TCP_ABORT_ON_LINGER: abort on linger * corresponding to LINUX_MIB_TCPABORTONLINGER */ SK_RST_REASON_TCP_ABORT_ON_LINGER, /** * @SK_RST_REASON_TCP_ABORT_ON_MEMORY: abort on memory * corresponding to LINUX_MIB_TCPABORTONMEMORY */ SK_RST_REASON_TCP_ABORT_ON_MEMORY, /** * @SK_RST_REASON_TCP_STATE: abort on tcp state * Please see RFC 9293 for all possible reset conditions */ SK_RST_REASON_TCP_STATE, /** * @SK_RST_REASON_TCP_KEEPALIVE_TIMEOUT: time to timeout * When we have already run out of all the chances, which means * keepalive timeout, we have to reset the connection */ SK_RST_REASON_TCP_KEEPALIVE_TIMEOUT, /** * @SK_RST_REASON_TCP_DISCONNECT_WITH_DATA: disconnect when write * queue is not empty * It means user has written data into the write queue when doing * disconnecting, so we have to send an RST. */ SK_RST_REASON_TCP_DISCONNECT_WITH_DATA, /* Copy from include/uapi/linux/mptcp.h. * These reset fields will not be changed since they adhere to * RFC 8684. So do not touch them. I'm going to list each definition * of them respectively. */ /** * @SK_RST_REASON_MPTCP_RST_EUNSPEC: Unspecified error. * This is the default error; it implies that the subflow is no * longer available. The presence of this option shows that the * RST was generated by an MPTCP-aware device. */ SK_RST_REASON_MPTCP_RST_EUNSPEC, /** * @SK_RST_REASON_MPTCP_RST_EMPTCP: MPTCP-specific error. * An error has been detected in the processing of MPTCP options. * This is the usual reason code to return in the cases where a RST * is being sent to close a subflow because of an invalid response. */ SK_RST_REASON_MPTCP_RST_EMPTCP, /** * @SK_RST_REASON_MPTCP_RST_ERESOURCE: Lack of resources. * This code indicates that the sending host does not have enough * resources to support the terminated subflow. */ SK_RST_REASON_MPTCP_RST_ERESOURCE, /** * @SK_RST_REASON_MPTCP_RST_EPROHIBIT: Administratively prohibited. * This code indicates that the requested subflow is prohibited by * the policies of the sending host. */ SK_RST_REASON_MPTCP_RST_EPROHIBIT, /** * @SK_RST_REASON_MPTCP_RST_EWQ2BIG: Too much outstanding data. * This code indicates that there is an excessive amount of data * that needs to be transmitted over the terminated subflow while * having already been acknowledged over one or more other subflows. * This may occur if a path has been unavailable for a short period * and it is more efficient to reset and start again than it is to * retransmit the queued data. */ SK_RST_REASON_MPTCP_RST_EWQ2BIG, /** * @SK_RST_REASON_MPTCP_RST_EBADPERF: Unacceptable performance. * This code indicates that the performance of this subflow was * too low compared to the other subflows of this Multipath TCP * connection. */ SK_RST_REASON_MPTCP_RST_EBADPERF, /** * @SK_RST_REASON_MPTCP_RST_EMIDDLEBOX: Middlebox interference. * Middlebox interference has been detected over this subflow, * making MPTCP signaling invalid. For example, this may be sent * if the checksum does not validate. */ SK_RST_REASON_MPTCP_RST_EMIDDLEBOX, /** @SK_RST_REASON_ERROR: unexpected error happens */ SK_RST_REASON_ERROR, /** * @SK_RST_REASON_MAX: Maximum of socket reset reasons. * It shouldn't be used as a real 'reason'. */ SK_RST_REASON_MAX, }; /* Convert skb drop reasons to enum sk_rst_reason type */ static inline enum sk_rst_reason sk_rst_convert_drop_reason(enum skb_drop_reason reason) { switch (reason) { case SKB_DROP_REASON_NOT_SPECIFIED: return SK_RST_REASON_NOT_SPECIFIED; case SKB_DROP_REASON_NO_SOCKET: return SK_RST_REASON_NO_SOCKET; case SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: return SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE; case SKB_DROP_REASON_TCP_RFC7323_PAWS: return SK_RST_REASON_TCP_RFC7323_PAWS; case SKB_DROP_REASON_TCP_TOO_OLD_ACK: return SK_RST_REASON_TCP_TOO_OLD_ACK; case SKB_DROP_REASON_TCP_ACK_UNSENT_DATA: return SK_RST_REASON_TCP_ACK_UNSENT_DATA; case SKB_DROP_REASON_TCP_FLAGS: return SK_RST_REASON_TCP_FLAGS; case SKB_DROP_REASON_TCP_OLD_ACK: return SK_RST_REASON_TCP_OLD_ACK; case SKB_DROP_REASON_TCP_ABORT_ON_DATA: return SK_RST_REASON_TCP_ABORT_ON_DATA; default: /* If we don't have our own corresponding reason */ return SK_RST_REASON_NOT_SPECIFIED; } } #endif
13 4 10 11 2 9 6 6 1 5 39 38 39 15 4 19 1 1 20 3 18 17 1 17 1 1 8 3 1 4 2 1 2 1 2 1 1 1 3 1 1 1 1 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2006-2010 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/netfilter.h> #include <linux/in.h> #include <linux/icmp.h> #include <linux/seq_file.h> #include <net/ip.h> #include <net/checksum.h> #include <linux/netfilter_ipv4.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> #include "nf_internals.h" static const unsigned int nf_ct_icmp_timeout = 30*HZ; bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmphdr *hp; struct icmphdr _hdr; hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hp == NULL) return false; tuple->dst.u.icmp.type = hp->type; tuple->src.u.icmp.id = hp->un.echo.id; tuple->dst.u.icmp.code = hp->code; return true; } /* Add 1; spaces filled with 0. */ static const u_int8_t invmap[] = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, [ICMP_ECHOREPLY] = ICMP_ECHO + 1, [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 }; bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { if (orig->dst.u.icmp.type >= sizeof(invmap) || !invmap[orig->dst.u.icmp.type]) return false; tuple->src.u.icmp.id = orig->src.u.icmp.id; tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; tuple->dst.u.icmp.code = orig->dst.u.icmp.code; return true; } /* Returns verdict for packet, or -1 for invalid. */ int nf_conntrack_icmp_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. */ unsigned int *timeout = nf_ct_timeout_lookup(ct); static const u_int8_t valid_new[] = { [ICMP_ECHO] = 1, [ICMP_TIMESTAMP] = 1, [ICMP_INFO_REQUEST] = 1, [ICMP_ADDRESS] = 1 }; if (state->pf != NFPROTO_IPV4) return -NF_ACCEPT; if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { /* Can't create a new ICMP `conn' with this. */ pr_debug("icmp: can't create new conn with type %u\n", ct->tuplehash[0].tuple.dst.u.icmp.type); nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); return -NF_ACCEPT; } if (!timeout) timeout = &nf_icmp_pernet(nf_ct_net(ct))->timeout; nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } /* Check inner header is related to any of the existing connections */ int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state, u8 l4proto, union nf_inet_addr *outer_daddr) { struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_zone *zone; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; union nf_inet_addr *ct_daddr; enum ip_conntrack_dir dir; struct nf_conn *ct; WARN_ON(skb_nfct(skb)); zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); /* Are they talking about one of our connections? */ if (!nf_ct_get_tuplepr(skb, dataoff, state->pf, state->net, &origtuple)) return -NF_ACCEPT; /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. */ if (!nf_ct_invert_tuple(&innertuple, &origtuple)) return -NF_ACCEPT; h = nf_conntrack_find_get(state->net, zone, &innertuple); if (!h) return -NF_ACCEPT; /* Consider: A -> T (=This machine) -> B * Conntrack entry will look like this: * Original: A->B * Reply: B->T (SNAT case) OR A * * When this function runs, we got packet that looks like this: * iphdr|icmphdr|inner_iphdr|l4header (tcp, udp, ..). * * Above nf_conntrack_find_get() makes lookup based on inner_hdr, * so we should expect that destination of the found connection * matches outer header destination address. * * In above example, we can consider these two cases: * 1. Error coming in reply direction from B or M (middle box) to * T (SNAT case) or A. * Inner saddr will be B, dst will be T or A. * The found conntrack will be reply tuple (B->T/A). * 2. Error coming in original direction from A or M to B. * Inner saddr will be A, inner daddr will be B. * The found conntrack will be original tuple (A->B). * * In both cases, conntrack[dir].dst == inner.dst. * * A bogus packet could look like this: * Inner: B->T * Outer: B->X (other machine reachable by T). * * In this case, lookup yields connection A->B and will * set packet from B->X as *RELATED*, even though no connection * from X was ever seen. */ ct = nf_ct_tuplehash_to_ctrack(h); dir = NF_CT_DIRECTION(h); ct_daddr = &ct->tuplehash[dir].tuple.dst.u3; if (!nf_inet_addr_cmp(outer_daddr, ct_daddr)) { if (state->pf == AF_INET) { nf_l4proto_log_invalid(skb, state, l4proto, "outer daddr %pI4 != inner %pI4", &outer_daddr->ip, &ct_daddr->ip); } else if (state->pf == AF_INET6) { nf_l4proto_log_invalid(skb, state, l4proto, "outer daddr %pI6 != inner %pI6", &outer_daddr->ip6, &ct_daddr->ip6); } nf_ct_put(ct); return -NF_ACCEPT; } ctinfo = IP_CT_RELATED; if (dir == IP_CT_DIR_REPLY) ctinfo += IP_CT_IS_REPLY; /* Update skb to refer to this connection */ nf_ct_set(skb, ct, ctinfo); return NF_ACCEPT; } static void icmp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_ICMP, "%s", msg); } /* Small and modified version of icmp_rcv */ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { union nf_inet_addr outer_daddr; const struct icmphdr *icmph; struct icmphdr _ih; /* Not enough header? */ icmph = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmph == NULL) { icmp_error_log(skb, state, "short packet"); return -NF_ACCEPT; } /* See nf_conntrack_proto_tcp.c */ if (state->net->ct.sysctl_checksum && state->hook == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, state->hook, dataoff, IPPROTO_ICMP)) { icmp_error_log(skb, state, "bad hw icmp checksum"); return -NF_ACCEPT; } /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently * discarded. */ if (icmph->type > NR_ICMP_TYPES) { icmp_error_log(skb, state, "invalid icmp type"); return -NF_ACCEPT; } /* Need to track icmp error message? */ if (!icmp_is_err(icmph->type)) return NF_ACCEPT; memset(&outer_daddr, 0, sizeof(outer_daddr)); outer_daddr.ip = ip_hdr(skb)->daddr; dataoff += sizeof(*icmph); return nf_conntrack_inet_error(tmpl, skb, dataoff, state, IPPROTO_ICMP, &outer_daddr); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int icmp_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) || nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) || nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 }, [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 }, [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 }, }; static int icmp_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *tuple, u_int32_t flags) { if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) { if (!tb[CTA_PROTO_ICMP_TYPE]) return -EINVAL; tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); if (tuple->dst.u.icmp.type >= sizeof(invmap) || !invmap[tuple->dst.u.icmp.type]) return -EINVAL; } if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) { if (!tb[CTA_PROTO_ICMP_CODE]) return -EINVAL; tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); } if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) { if (!tb[CTA_PROTO_ICMP_ID]) return -EINVAL; tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); } return 0; } static unsigned int icmp_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); return size; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; struct nf_icmp_net *in = nf_icmp_pernet(net); if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { if (!timeout) timeout = &in->timeout; *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; } else if (timeout) { /* Set default ICMP timeout. */ *timeout = in->timeout; } return 0; } static int icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeout = data; if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_icmp_init_net(struct net *net) { struct nf_icmp_net *in = nf_icmp_pernet(net); in->timeout = nf_ct_icmp_timeout; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = { .l4proto = IPPROTO_ICMP, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmp_tuple_to_nlattr, .nlattr_tuple_size = icmp_nlattr_tuple_size, .nlattr_to_tuple = icmp_nlattr_to_tuple, .nla_policy = icmp_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = icmp_timeout_nlattr_to_obj, .obj_to_nlattr = icmp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_ICMP_MAX, .obj_size = sizeof(unsigned int), .nla_policy = icmp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ };
12 34 34 34 22 5 27 26 12 12 12 2 34 24 9 34 34 27 17 16 27 12 25 27 27 27 1 1 34 27 27 34 34 9 7 3 27 27 4 24 3 5 26 11 26 5 13 2 11 3 11 2 11 11 1 11 10 1 9 2 8 3 3 11 3 8 8 1 3 3 11 10 11 8 3 3 8 8 8 8 11 2 2 2 2 2 11 11 11 1 11 11 10 2 2 2 2 2 2 2 2 2 11 11 2 20 21 8 7 8 8 8 23 23 17 12 2 2 7 16 8 8 23 23 23 22 8 8 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2004 * Portions Copyright (C) Christoph Hellwig, 2001-2002 */ /* * jfs_logmgr.c: log manager * * for related information, see transaction manager (jfs_txnmgr.c), and * recovery manager (jfs_logredo.c). * * note: for detail, RTFS. * * log buffer manager: * special purpose buffer manager supporting log i/o requirements. * per log serial pageout of logpage * queuing i/o requests and redrive i/o at iodone * maintain current logpage buffer * no caching since append only * appropriate jfs buffer cache buffers as needed * * group commit: * transactions which wrote COMMIT records in the same in-memory * log page during the pageout of previous/current log page(s) are * committed together by the pageout of the page. * * TBD lazy commit: * transactions are committed asynchronously when the log page * containing it COMMIT is paged out when it becomes full; * * serialization: * . a per log lock serialize log write. * . a per log lock serialize group commit. * . a per log lock serialize log open/close; * * TBD log integrity: * careful-write (ping-pong) of last logpage to recover from crash * in overwrite. * detection of split (out-of-order) write of physical sectors * of last logpage via timestamp at end of each sector * with its mirror data array at trailer). * * alternatives: * lsn - 64-bit monotonically increasing integer vs * 32-bit lspn and page eor. */ #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/interrupt.h> #include <linux/completion.h> #include <linux/kthread.h> #include <linux/buffer_head.h> /* for sync_blockdev() */ #include <linux/bio.h> #include <linux/freezer.h> #include <linux/export.h> #include <linux/delay.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/slab.h> #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_metapage.h" #include "jfs_superblock.h" #include "jfs_txnmgr.h" #include "jfs_debug.h" /* * lbuf's ready to be redriven. Protected by log_redrive_lock (jfsIO thread) */ static struct lbuf *log_redrive_list; static DEFINE_SPINLOCK(log_redrive_lock); /* * log read/write serialization (per log) */ #define LOG_LOCK_INIT(log) mutex_init(&(log)->loglock) #define LOG_LOCK(log) mutex_lock(&((log)->loglock)) #define LOG_UNLOCK(log) mutex_unlock(&((log)->loglock)) /* * log group commit serialization (per log) */ #define LOGGC_LOCK_INIT(log) spin_lock_init(&(log)->gclock) #define LOGGC_LOCK(log) spin_lock_irq(&(log)->gclock) #define LOGGC_UNLOCK(log) spin_unlock_irq(&(log)->gclock) #define LOGGC_WAKEUP(tblk) wake_up_all(&(tblk)->gcwait) /* * log sync serialization (per log) */ #define LOGSYNC_DELTA(logsize) min((logsize)/8, 128*LOGPSIZE) #define LOGSYNC_BARRIER(logsize) ((logsize)/4) /* #define LOGSYNC_DELTA(logsize) min((logsize)/4, 256*LOGPSIZE) #define LOGSYNC_BARRIER(logsize) ((logsize)/2) */ /* * log buffer cache synchronization */ static DEFINE_SPINLOCK(jfsLCacheLock); #define LCACHE_LOCK(flags) spin_lock_irqsave(&jfsLCacheLock, flags) #define LCACHE_UNLOCK(flags) spin_unlock_irqrestore(&jfsLCacheLock, flags) /* * See __SLEEP_COND in jfs_locks.h */ #define LCACHE_SLEEP_COND(wq, cond, flags) \ do { \ if (cond) \ break; \ __SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \ } while (0) #define LCACHE_WAKEUP(event) wake_up(event) /* * lbuf buffer cache (lCache) control */ /* log buffer manager pageout control (cumulative, inclusive) */ #define lbmREAD 0x0001 #define lbmWRITE 0x0002 /* enqueue at tail of write queue; * init pageout if at head of queue; */ #define lbmRELEASE 0x0004 /* remove from write queue * at completion of pageout; * do not free/recycle it yet: * caller will free it; */ #define lbmSYNC 0x0008 /* do not return to freelist * when removed from write queue; */ #define lbmFREE 0x0010 /* return to freelist * at completion of pageout; * the buffer may be recycled; */ #define lbmDONE 0x0020 #define lbmERROR 0x0040 #define lbmGC 0x0080 /* lbmIODone to perform post-GC processing * of log page */ #define lbmDIRECT 0x0100 /* * Global list of active external journals */ static LIST_HEAD(jfs_external_logs); static struct jfs_log *dummy_log; static DEFINE_MUTEX(jfs_log_mutex); /* * forward references */ static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck); static int lmNextPage(struct jfs_log * log); static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, int activate); static int open_inline_log(struct super_block *sb); static int open_dummy_log(struct super_block *sb); static int lbmLogInit(struct jfs_log * log); static void lbmLogShutdown(struct jfs_log * log); static struct lbuf *lbmAllocate(struct jfs_log * log, int); static void lbmFree(struct lbuf * bp); static void lbmfree(struct lbuf * bp); static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp); static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block); static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag); static int lbmIOWait(struct lbuf * bp, int flag); static bio_end_io_t lbmIODone; static void lbmStartIO(struct lbuf * bp); static void lmGCwrite(struct jfs_log * log, int cant_block); static int lmLogSync(struct jfs_log * log, int hard_sync); /* * statistics */ #ifdef CONFIG_JFS_STATISTICS static struct lmStat { uint commit; /* # of commit */ uint pagedone; /* # of page written */ uint submitted; /* # of pages submitted */ uint full_page; /* # of full pages submitted */ uint partial_page; /* # of partial pages submitted */ } lmStat; #endif static void write_special_inodes(struct jfs_log *log, int (*writer)(struct address_space *)) { struct jfs_sb_info *sbi; list_for_each_entry(sbi, &log->sb_list, log_list) { writer(sbi->ipbmap->i_mapping); writer(sbi->ipimap->i_mapping); writer(sbi->direct_inode->i_mapping); } } /* * NAME: lmLog() * * FUNCTION: write a log record; * * PARAMETER: * * RETURN: lsn - offset to the next log record to write (end-of-log); * -1 - error; * * note: todo: log error handler */ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) { int lsn; int diffp, difft; struct metapage *mp = NULL; unsigned long flags; jfs_info("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p", log, tblk, lrd, tlck); LOG_LOCK(log); /* log by (out-of-transaction) JFS ? */ if (tblk == NULL) goto writeRecord; /* log from page ? */ if (tlck == NULL || tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL) goto writeRecord; /* * initialize/update page/transaction recovery lsn */ lsn = log->lsn; LOGSYNC_LOCK(log, flags); /* * initialize page lsn if first log write of the page */ if (mp->lsn == 0) { mp->log = log; mp->lsn = lsn; log->count++; /* insert page at tail of logsynclist */ list_add_tail(&mp->synclist, &log->synclist); } /* * initialize/update lsn of tblock of the page * * transaction inherits oldest lsn of pages associated * with allocation/deallocation of resources (their * log records are used to reconstruct allocation map * at recovery time: inode for inode allocation map, * B+-tree index of extent descriptors for block * allocation map); * allocation map pages inherit transaction lsn at * commit time to allow forwarding log syncpt past log * records associated with allocation/deallocation of * resources only after persistent map of these map pages * have been updated and propagated to home. */ /* * initialize transaction lsn: */ if (tblk->lsn == 0) { /* inherit lsn of its first page logged */ tblk->lsn = mp->lsn; log->count++; /* insert tblock after the page on logsynclist */ list_add(&tblk->synclist, &mp->synclist); } /* * update transaction lsn: */ else { /* inherit oldest/smallest lsn of page */ logdiff(diffp, mp->lsn, log); logdiff(difft, tblk->lsn, log); if (diffp < difft) { /* update tblock lsn with page lsn */ tblk->lsn = mp->lsn; /* move tblock after page on logsynclist */ list_move(&tblk->synclist, &mp->synclist); } } LOGSYNC_UNLOCK(log, flags); /* * write the log record */ writeRecord: lsn = lmWriteRecord(log, tblk, lrd, tlck); /* * forward log syncpt if log reached next syncpt trigger */ logdiff(diffp, lsn, log); if (diffp >= log->nextsync) lsn = lmLogSync(log, 0); /* update end-of-log lsn */ log->lsn = lsn; LOG_UNLOCK(log); /* return end-of-log address */ return lsn; } /* * NAME: lmWriteRecord() * * FUNCTION: move the log record to current log page * * PARAMETER: cd - commit descriptor * * RETURN: end-of-log address * * serialization: LOG_LOCK() held on entry/exit */ static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) { int lsn = 0; /* end-of-log address */ struct lbuf *bp; /* dst log page buffer */ struct logpage *lp; /* dst log page */ caddr_t dst; /* destination address in log page */ int dstoffset; /* end-of-log offset in log page */ int freespace; /* free space in log page */ caddr_t p; /* src meta-data page */ caddr_t src; int srclen; int nbytes; /* number of bytes to move */ int i; int len; struct linelock *linelock; struct lv *lv; struct lvd *lvd; int l2linesize; len = 0; /* retrieve destination log page to write */ bp = (struct lbuf *) log->bp; lp = (struct logpage *) bp->l_ldata; dstoffset = log->eor; /* any log data to write ? */ if (tlck == NULL) goto moveLrd; /* * move log record data */ /* retrieve source meta-data page to log */ if (tlck->flag & tlckPAGELOCK) { p = (caddr_t) (tlck->mp->data); linelock = (struct linelock *) & tlck->lock; } /* retrieve source in-memory inode to log */ else if (tlck->flag & tlckINODELOCK) { if (tlck->type & tlckDTREE) p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot; else p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot; linelock = (struct linelock *) & tlck->lock; } else { jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck); return 0; /* Probably should trap */ } l2linesize = linelock->l2linesize; moveData: ASSERT(linelock->index <= linelock->maxcnt); lv = linelock->lv; for (i = 0; i < linelock->index; i++, lv++) { if (lv->length == 0) continue; /* is page full ? */ if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) { /* page become full: move on to next page */ lmNextPage(log); bp = log->bp; lp = (struct logpage *) bp->l_ldata; dstoffset = LOGPHDRSIZE; } /* * move log vector data */ src = (u8 *) p + (lv->offset << l2linesize); srclen = lv->length << l2linesize; len += srclen; while (srclen > 0) { freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; nbytes = min(freespace, srclen); dst = (caddr_t) lp + dstoffset; memcpy(dst, src, nbytes); dstoffset += nbytes; /* is page not full ? */ if (dstoffset < LOGPSIZE - LOGPTLRSIZE) break; /* page become full: move on to next page */ lmNextPage(log); bp = (struct lbuf *) log->bp; lp = (struct logpage *) bp->l_ldata; dstoffset = LOGPHDRSIZE; srclen -= nbytes; src += nbytes; } /* * move log vector descriptor */ len += 4; lvd = (struct lvd *) ((caddr_t) lp + dstoffset); lvd->offset = cpu_to_le16(lv->offset); lvd->length = cpu_to_le16(lv->length); dstoffset += 4; jfs_info("lmWriteRecord: lv offset:%d length:%d", lv->offset, lv->length); } if ((i = linelock->next)) { linelock = (struct linelock *) lid_to_tlock(i); goto moveData; } /* * move log record descriptor */ moveLrd: lrd->length = cpu_to_le16(len); src = (caddr_t) lrd; srclen = LOGRDSIZE; while (srclen > 0) { freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; nbytes = min(freespace, srclen); dst = (caddr_t) lp + dstoffset; memcpy(dst, src, nbytes); dstoffset += nbytes; srclen -= nbytes; /* are there more to move than freespace of page ? */ if (srclen) goto pageFull; /* * end of log record descriptor */ /* update last log record eor */ log->eor = dstoffset; bp->l_eor = dstoffset; lsn = (log->page << L2LOGPSIZE) + dstoffset; if (lrd->type & cpu_to_le16(LOG_COMMIT)) { tblk->clsn = lsn; jfs_info("wr: tclsn:0x%x, beor:0x%x", tblk->clsn, bp->l_eor); INCREMENT(lmStat.commit); /* # of commit */ /* * enqueue tblock for group commit: * * enqueue tblock of non-trivial/synchronous COMMIT * at tail of group commit queue * (trivial/asynchronous COMMITs are ignored by * group commit.) */ LOGGC_LOCK(log); /* init tblock gc state */ tblk->flag = tblkGC_QUEUE; tblk->bp = log->bp; tblk->pn = log->page; tblk->eor = log->eor; /* enqueue transaction to commit queue */ list_add_tail(&tblk->cqueue, &log->cqueue); LOGGC_UNLOCK(log); } jfs_info("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x", le16_to_cpu(lrd->type), log->bp, log->page, dstoffset); /* page not full ? */ if (dstoffset < LOGPSIZE - LOGPTLRSIZE) return lsn; pageFull: /* page become full: move on to next page */ lmNextPage(log); bp = (struct lbuf *) log->bp; lp = (struct logpage *) bp->l_ldata; dstoffset = LOGPHDRSIZE; src += nbytes; } return lsn; } /* * NAME: lmNextPage() * * FUNCTION: write current page and allocate next page. * * PARAMETER: log * * RETURN: 0 * * serialization: LOG_LOCK() held on entry/exit */ static int lmNextPage(struct jfs_log * log) { struct logpage *lp; int lspn; /* log sequence page number */ int pn; /* current page number */ struct lbuf *bp; struct lbuf *nextbp; struct tblock *tblk; /* get current log page number and log sequence page number */ pn = log->page; bp = log->bp; lp = (struct logpage *) bp->l_ldata; lspn = le32_to_cpu(lp->h.page); LOGGC_LOCK(log); /* * write or queue the full page at the tail of write queue */ /* get the tail tblk on commit queue */ if (list_empty(&log->cqueue)) tblk = NULL; else tblk = list_entry(log->cqueue.prev, struct tblock, cqueue); /* every tblk who has COMMIT record on the current page, * and has not been committed, must be on commit queue * since tblk is queued at commit queueu at the time * of writing its COMMIT record on the page before * page becomes full (even though the tblk thread * who wrote COMMIT record may have been suspended * currently); */ /* is page bound with outstanding tail tblk ? */ if (tblk && tblk->pn == pn) { /* mark tblk for end-of-page */ tblk->flag |= tblkGC_EOP; if (log->cflag & logGC_PAGEOUT) { /* if page is not already on write queue, * just enqueue (no lbmWRITE to prevent redrive) * buffer to wqueue to ensure correct serial order * of the pages since log pages will be added * continuously */ if (bp->l_wqnext == NULL) lbmWrite(log, bp, 0, 0); } else { /* * No current GC leader, initiate group commit */ log->cflag |= logGC_PAGEOUT; lmGCwrite(log, 0); } } /* page is not bound with outstanding tblk: * init write or mark it to be redriven (lbmWRITE) */ else { /* finalize the page */ bp->l_ceor = bp->l_eor; lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0); } LOGGC_UNLOCK(log); /* * allocate/initialize next page */ /* if log wraps, the first data page of log is 2 * (0 never used, 1 is superblock). */ log->page = (pn == log->size - 1) ? 2 : pn + 1; log->eor = LOGPHDRSIZE; /* ? valid page empty/full at logRedo() */ /* allocate/initialize next log page buffer */ nextbp = lbmAllocate(log, log->page); nextbp->l_eor = log->eor; log->bp = nextbp; /* initialize next log page */ lp = (struct logpage *) nextbp->l_ldata; lp->h.page = lp->t.page = cpu_to_le32(lspn + 1); lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); return 0; } /* * NAME: lmGroupCommit() * * FUNCTION: group commit * initiate pageout of the pages with COMMIT in the order of * page number - redrive pageout of the page at the head of * pageout queue until full page has been written. * * RETURN: * * NOTE: * LOGGC_LOCK serializes log group commit queue, and * transaction blocks on the commit queue. * N.B. LOG_LOCK is NOT held during lmGroupCommit(). */ int lmGroupCommit(struct jfs_log * log, struct tblock * tblk) { int rc = 0; LOGGC_LOCK(log); /* group committed already ? */ if (tblk->flag & tblkGC_COMMITTED) { if (tblk->flag & tblkGC_ERROR) rc = -EIO; LOGGC_UNLOCK(log); return rc; } jfs_info("lmGroup Commit: tblk = 0x%p, gcrtc = %d", tblk, log->gcrtc); if (tblk->xflag & COMMIT_LAZY) tblk->flag |= tblkGC_LAZY; if ((!(log->cflag & logGC_PAGEOUT)) && (!list_empty(&log->cqueue)) && (!(tblk->xflag & COMMIT_LAZY) || test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low)) { /* * No pageout in progress * * start group commit as its group leader. */ log->cflag |= logGC_PAGEOUT; lmGCwrite(log, 0); } if (tblk->xflag & COMMIT_LAZY) { /* * Lazy transactions can leave now */ LOGGC_UNLOCK(log); return 0; } /* lmGCwrite gives up LOGGC_LOCK, check again */ if (tblk->flag & tblkGC_COMMITTED) { if (tblk->flag & tblkGC_ERROR) rc = -EIO; LOGGC_UNLOCK(log); return rc; } /* upcount transaction waiting for completion */ log->gcrtc++; tblk->flag |= tblkGC_READY; __SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED), LOGGC_LOCK(log), LOGGC_UNLOCK(log)); /* removed from commit queue */ if (tblk->flag & tblkGC_ERROR) rc = -EIO; LOGGC_UNLOCK(log); return rc; } /* * NAME: lmGCwrite() * * FUNCTION: group commit write * initiate write of log page, building a group of all transactions * with commit records on that page. * * RETURN: None * * NOTE: * LOGGC_LOCK must be held by caller. * N.B. LOG_LOCK is NOT held during lmGroupCommit(). */ static void lmGCwrite(struct jfs_log * log, int cant_write) { struct lbuf *bp; struct logpage *lp; int gcpn; /* group commit page number */ struct tblock *tblk; struct tblock *xtblk = NULL; /* * build the commit group of a log page * * scan commit queue and make a commit group of all * transactions with COMMIT records on the same log page. */ /* get the head tblk on the commit queue */ gcpn = list_entry(log->cqueue.next, struct tblock, cqueue)->pn; list_for_each_entry(tblk, &log->cqueue, cqueue) { if (tblk->pn != gcpn) break; xtblk = tblk; /* state transition: (QUEUE, READY) -> COMMIT */ tblk->flag |= tblkGC_COMMIT; } tblk = xtblk; /* last tblk of the page */ /* * pageout to commit transactions on the log page. */ bp = (struct lbuf *) tblk->bp; lp = (struct logpage *) bp->l_ldata; /* is page already full ? */ if (tblk->flag & tblkGC_EOP) { /* mark page to free at end of group commit of the page */ tblk->flag &= ~tblkGC_EOP; tblk->flag |= tblkGC_FREE; bp->l_ceor = bp->l_eor; lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC, cant_write); INCREMENT(lmStat.full_page); } /* page is not yet full */ else { bp->l_ceor = tblk->eor; /* ? bp->l_ceor = bp->l_eor; */ lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); lbmWrite(log, bp, lbmWRITE | lbmGC, cant_write); INCREMENT(lmStat.partial_page); } } /* * NAME: lmPostGC() * * FUNCTION: group commit post-processing * Processes transactions after their commit records have been written * to disk, redriving log I/O if necessary. * * RETURN: None * * NOTE: * This routine is called a interrupt time by lbmIODone */ static void lmPostGC(struct lbuf * bp) { unsigned long flags; struct jfs_log *log = bp->l_log; struct logpage *lp; struct tblock *tblk, *temp; //LOGGC_LOCK(log); spin_lock_irqsave(&log->gclock, flags); /* * current pageout of group commit completed. * * remove/wakeup transactions from commit queue who were * group committed with the current log page */ list_for_each_entry_safe(tblk, temp, &log->cqueue, cqueue) { if (!(tblk->flag & tblkGC_COMMIT)) break; /* if transaction was marked GC_COMMIT then * it has been shipped in the current pageout * and made it to disk - it is committed. */ if (bp->l_flag & lbmERROR) tblk->flag |= tblkGC_ERROR; /* remove it from the commit queue */ list_del(&tblk->cqueue); tblk->flag &= ~tblkGC_QUEUE; if (tblk == log->flush_tblk) { /* we can stop flushing the log now */ clear_bit(log_FLUSH, &log->flag); log->flush_tblk = NULL; } jfs_info("lmPostGC: tblk = 0x%p, flag = 0x%x", tblk, tblk->flag); if (!(tblk->xflag & COMMIT_FORCE)) /* * Hand tblk over to lazy commit thread */ txLazyUnlock(tblk); else { /* state transition: COMMIT -> COMMITTED */ tblk->flag |= tblkGC_COMMITTED; if (tblk->flag & tblkGC_READY) log->gcrtc--; LOGGC_WAKEUP(tblk); } /* was page full before pageout ? * (and this is the last tblk bound with the page) */ if (tblk->flag & tblkGC_FREE) lbmFree(bp); /* did page become full after pageout ? * (and this is the last tblk bound with the page) */ else if (tblk->flag & tblkGC_EOP) { /* finalize the page */ lp = (struct logpage *) bp->l_ldata; bp->l_ceor = bp->l_eor; lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); jfs_info("lmPostGC: calling lbmWrite"); lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 1); } } /* are there any transactions who have entered lnGroupCommit() * (whose COMMITs are after that of the last log page written. * They are waiting for new group commit (above at (SLEEP 1)) * or lazy transactions are on a full (queued) log page, * select the latest ready transaction as new group leader and * wake her up to lead her group. */ if ((!list_empty(&log->cqueue)) && ((log->gcrtc > 0) || (tblk->bp->l_wqnext != NULL) || test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low)) /* * Call lmGCwrite with new group leader */ lmGCwrite(log, 1); /* no transaction are ready yet (transactions are only just * queued (GC_QUEUE) and not entered for group commit yet). * the first transaction entering group commit * will elect herself as new group leader. */ else log->cflag &= ~logGC_PAGEOUT; //LOGGC_UNLOCK(log); spin_unlock_irqrestore(&log->gclock, flags); return; } /* * NAME: lmLogSync() * * FUNCTION: write log SYNCPT record for specified log * if new sync address is available * (normally the case if sync() is executed by back-ground * process). * calculate new value of i_nextsync which determines when * this code is called again. * * PARAMETERS: log - log structure * hard_sync - 1 to force all metadata to be written * * RETURN: 0 * * serialization: LOG_LOCK() held on entry/exit */ static int lmLogSync(struct jfs_log * log, int hard_sync) { int logsize; int written; /* written since last syncpt */ int free; /* free space left available */ int delta; /* additional delta to write normally */ int more; /* additional write granted */ struct lrd lrd; int lsn; struct logsyncblk *lp; unsigned long flags; /* push dirty metapages out to disk */ if (hard_sync) write_special_inodes(log, filemap_fdatawrite); else write_special_inodes(log, filemap_flush); /* * forward syncpt */ /* if last sync is same as last syncpt, * invoke sync point forward processing to update sync. */ if (log->sync == log->syncpt) { LOGSYNC_LOCK(log, flags); if (list_empty(&log->synclist)) log->sync = log->lsn; else { lp = list_entry(log->synclist.next, struct logsyncblk, synclist); log->sync = lp->lsn; } LOGSYNC_UNLOCK(log, flags); } /* if sync is different from last syncpt, * write a SYNCPT record with syncpt = sync. * reset syncpt = sync */ if (log->sync != log->syncpt) { lrd.logtid = 0; lrd.backchain = 0; lrd.type = cpu_to_le16(LOG_SYNCPT); lrd.length = 0; lrd.log.syncpt.sync = cpu_to_le32(log->sync); lsn = lmWriteRecord(log, NULL, &lrd, NULL); log->syncpt = log->sync; } else lsn = log->lsn; /* * setup next syncpt trigger (SWAG) */ logsize = log->logsize; logdiff(written, lsn, log); free = logsize - written; delta = LOGSYNC_DELTA(logsize); more = min(free / 2, delta); if (more < 2 * LOGPSIZE) { jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n"); /* * log wrapping * * option 1 - panic ? No.! * option 2 - shutdown file systems * associated with log ? * option 3 - extend log ? * option 4 - second chance * * mark log wrapped, and continue. * when all active transactions are completed, * mark log valid for recovery. * if crashed during invalid state, log state * implies invalid log, forcing fsck(). */ /* mark log state log wrap in log superblock */ /* log->state = LOGWRAP; */ /* reset sync point computation */ log->syncpt = log->sync = lsn; log->nextsync = delta; } else /* next syncpt trigger = written + more */ log->nextsync = written + more; /* if number of bytes written from last sync point is more * than 1/4 of the log size, stop new transactions from * starting until all current transactions are completed * by setting syncbarrier flag. */ if (!test_bit(log_SYNCBARRIER, &log->flag) && (written > LOGSYNC_BARRIER(logsize)) && log->active) { set_bit(log_SYNCBARRIER, &log->flag); jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn, log->syncpt); /* * We may have to initiate group commit */ jfs_flush_journal(log, 0); } return lsn; } /* * NAME: jfs_syncpt * * FUNCTION: write log SYNCPT record for specified log * * PARAMETERS: log - log structure * hard_sync - set to 1 to force metadata to be written */ void jfs_syncpt(struct jfs_log *log, int hard_sync) { LOG_LOCK(log); if (!test_bit(log_QUIESCE, &log->flag)) lmLogSync(log, hard_sync); LOG_UNLOCK(log); } /* * NAME: lmLogOpen() * * FUNCTION: open the log on first open; * insert filesystem in the active list of the log. * * PARAMETER: ipmnt - file system mount inode * iplog - log inode (out) * * RETURN: * * serialization: */ int lmLogOpen(struct super_block *sb) { int rc; struct file *bdev_file; struct jfs_log *log; struct jfs_sb_info *sbi = JFS_SBI(sb); if (sbi->flag & JFS_NOINTEGRITY) return open_dummy_log(sb); if (sbi->mntflag & JFS_INLINELOG) return open_inline_log(sb); mutex_lock(&jfs_log_mutex); list_for_each_entry(log, &jfs_external_logs, journal_list) { if (file_bdev(log->bdev_file)->bd_dev == sbi->logdev) { if (!uuid_equal(&log->uuid, &sbi->loguuid)) { jfs_warn("wrong uuid on JFS journal"); mutex_unlock(&jfs_log_mutex); return -EINVAL; } /* * add file system to log active file system list */ if ((rc = lmLogFileSystem(log, sbi, 1))) { mutex_unlock(&jfs_log_mutex); return rc; } goto journal_found; } } if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) { mutex_unlock(&jfs_log_mutex); return -ENOMEM; } INIT_LIST_HEAD(&log->sb_list); init_waitqueue_head(&log->syncwait); /* * external log as separate logical volume * * file systems to log may have n-to-1 relationship; */ bdev_file = bdev_file_open_by_dev(sbi->logdev, BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL); if (IS_ERR(bdev_file)) { rc = PTR_ERR(bdev_file); goto free; } log->bdev_file = bdev_file; uuid_copy(&log->uuid, &sbi->loguuid); /* * initialize log: */ if ((rc = lmLogInit(log))) goto close; list_add(&log->journal_list, &jfs_external_logs); /* * add file system to log active file system list */ if ((rc = lmLogFileSystem(log, sbi, 1))) goto shutdown; journal_found: LOG_LOCK(log); list_add(&sbi->log_list, &log->sb_list); sbi->log = log; LOG_UNLOCK(log); mutex_unlock(&jfs_log_mutex); return 0; /* * unwind on error */ shutdown: /* unwind lbmLogInit() */ list_del(&log->journal_list); lbmLogShutdown(log); close: /* close external log device */ bdev_fput(bdev_file); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); kfree(log); jfs_warn("lmLogOpen: exit(%d)", rc); return rc; } static int open_inline_log(struct super_block *sb) { struct jfs_log *log; int rc; if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) return -ENOMEM; INIT_LIST_HEAD(&log->sb_list); init_waitqueue_head(&log->syncwait); set_bit(log_INLINELOG, &log->flag); log->bdev_file = sb->s_bdev_file; log->base = addressPXD(&JFS_SBI(sb)->logpxd); log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> (L2LOGPSIZE - sb->s_blocksize_bits); log->l2bsize = sb->s_blocksize_bits; ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits); /* * initialize log. */ if ((rc = lmLogInit(log))) { kfree(log); jfs_warn("lmLogOpen: exit(%d)", rc); return rc; } list_add(&JFS_SBI(sb)->log_list, &log->sb_list); JFS_SBI(sb)->log = log; return rc; } static int open_dummy_log(struct super_block *sb) { int rc; mutex_lock(&jfs_log_mutex); if (!dummy_log) { dummy_log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL); if (!dummy_log) { mutex_unlock(&jfs_log_mutex); return -ENOMEM; } INIT_LIST_HEAD(&dummy_log->sb_list); init_waitqueue_head(&dummy_log->syncwait); dummy_log->no_integrity = 1; /* Make up some stuff */ dummy_log->size = 1024; rc = lmLogInit(dummy_log); if (rc) { kfree(dummy_log); dummy_log = NULL; mutex_unlock(&jfs_log_mutex); return rc; } } LOG_LOCK(dummy_log); list_add(&JFS_SBI(sb)->log_list, &dummy_log->sb_list); JFS_SBI(sb)->log = dummy_log; LOG_UNLOCK(dummy_log); mutex_unlock(&jfs_log_mutex); return 0; } /* * NAME: lmLogInit() * * FUNCTION: log initialization at first log open. * * logredo() (or logformat()) should have been run previously. * initialize the log from log superblock. * set the log state in the superblock to LOGMOUNT and * write SYNCPT log record. * * PARAMETER: log - log structure * * RETURN: 0 - if ok * -EINVAL - bad log magic number or superblock dirty * error returned from logwait() * * serialization: single first open thread */ int lmLogInit(struct jfs_log * log) { int rc = 0; struct lrd lrd; struct logsuper *logsuper; struct lbuf *bpsuper; struct lbuf *bp; struct logpage *lp; int lsn = 0; jfs_info("lmLogInit: log:0x%p", log); /* initialize the group commit serialization lock */ LOGGC_LOCK_INIT(log); /* allocate/initialize the log write serialization lock */ LOG_LOCK_INIT(log); LOGSYNC_LOCK_INIT(log); INIT_LIST_HEAD(&log->synclist); INIT_LIST_HEAD(&log->cqueue); log->flush_tblk = NULL; log->count = 0; /* * initialize log i/o */ if ((rc = lbmLogInit(log))) return rc; if (!test_bit(log_INLINELOG, &log->flag)) log->l2bsize = L2LOGPSIZE; /* check for disabled journaling to disk */ if (log->no_integrity) { /* * Journal pages will still be filled. When the time comes * to actually do the I/O, the write is not done, and the * endio routine is called directly. */ bp = lbmAllocate(log , 0); log->bp = bp; bp->l_pn = bp->l_eor = 0; } else { /* * validate log superblock */ if ((rc = lbmRead(log, 1, &bpsuper))) goto errout10; logsuper = (struct logsuper *) bpsuper->l_ldata; if (logsuper->magic != cpu_to_le32(LOGMAGIC)) { jfs_warn("*** Log Format Error ! ***"); rc = -EINVAL; goto errout20; } /* logredo() should have been run successfully. */ if (logsuper->state != cpu_to_le32(LOGREDONE)) { jfs_warn("*** Log Is Dirty ! ***"); rc = -EINVAL; goto errout20; } /* initialize log from log superblock */ if (test_bit(log_INLINELOG,&log->flag)) { if (log->size != le32_to_cpu(logsuper->size)) { rc = -EINVAL; goto errout20; } jfs_info("lmLogInit: inline log:0x%p base:0x%Lx size:0x%x", log, (unsigned long long)log->base, log->size); } else { if (!uuid_equal(&logsuper->uuid, &log->uuid)) { jfs_warn("wrong uuid on JFS log device"); rc = -EINVAL; goto errout20; } log->size = le32_to_cpu(logsuper->size); log->l2bsize = le32_to_cpu(logsuper->l2bsize); jfs_info("lmLogInit: external log:0x%p base:0x%Lx size:0x%x", log, (unsigned long long)log->base, log->size); } log->page = le32_to_cpu(logsuper->end) / LOGPSIZE; log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page); /* * initialize for log append write mode */ /* establish current/end-of-log page/buffer */ if ((rc = lbmRead(log, log->page, &bp))) goto errout20; lp = (struct logpage *) bp->l_ldata; jfs_info("lmLogInit: lsn:0x%x page:%d eor:%d:%d", le32_to_cpu(logsuper->end), log->page, log->eor, le16_to_cpu(lp->h.eor)); log->bp = bp; bp->l_pn = log->page; bp->l_eor = log->eor; /* if current page is full, move on to next page */ if (log->eor >= LOGPSIZE - LOGPTLRSIZE) lmNextPage(log); /* * initialize log syncpoint */ /* * write the first SYNCPT record with syncpoint = 0 * (i.e., log redo up to HERE !); * remove current page from lbm write queue at end of pageout * (to write log superblock update), but do not release to * freelist; */ lrd.logtid = 0; lrd.backchain = 0; lrd.type = cpu_to_le16(LOG_SYNCPT); lrd.length = 0; lrd.log.syncpt.sync = 0; lsn = lmWriteRecord(log, NULL, &lrd, NULL); bp = log->bp; bp->l_ceor = bp->l_eor; lp = (struct logpage *) bp->l_ldata; lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0); if ((rc = lbmIOWait(bp, 0))) goto errout30; /* * update/write superblock */ logsuper->state = cpu_to_le32(LOGMOUNT); log->serial = le32_to_cpu(logsuper->serial) + 1; logsuper->serial = cpu_to_le32(log->serial); lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); if ((rc = lbmIOWait(bpsuper, lbmFREE))) goto errout30; } /* initialize logsync parameters */ log->logsize = (log->size - 2) << L2LOGPSIZE; log->lsn = lsn; log->syncpt = lsn; log->sync = log->syncpt; log->nextsync = LOGSYNC_DELTA(log->logsize); jfs_info("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x", log->lsn, log->syncpt, log->sync); /* * initialize for lazy/group commit */ log->clsn = lsn; return 0; /* * unwind on error */ errout30: /* release log page */ log->wqueue = NULL; bp->l_wqnext = NULL; lbmFree(bp); errout20: /* release log superblock */ lbmFree(bpsuper); errout10: /* unwind lbmLogInit() */ lbmLogShutdown(log); jfs_warn("lmLogInit: exit(%d)", rc); return rc; } /* * NAME: lmLogClose() * * FUNCTION: remove file system <ipmnt> from active list of log <iplog> * and close it on last close. * * PARAMETER: sb - superblock * * RETURN: errors from subroutines * * serialization: */ int lmLogClose(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; struct file *bdev_file; int rc = 0; jfs_info("lmLogClose: log:0x%p", log); mutex_lock(&jfs_log_mutex); LOG_LOCK(log); list_del(&sbi->log_list); LOG_UNLOCK(log); sbi->log = NULL; /* * We need to make sure all of the "written" metapages * actually make it to disk */ sync_blockdev(sb->s_bdev); if (test_bit(log_INLINELOG, &log->flag)) { /* * in-line log in host file system */ rc = lmLogShutdown(log); kfree(log); goto out; } if (!log->no_integrity) lmLogFileSystem(log, sbi, 0); if (!list_empty(&log->sb_list)) goto out; /* * TODO: ensure that the dummy_log is in a state to allow * lbmLogShutdown to deallocate all the buffers and call * kfree against dummy_log. For now, leave dummy_log & its * buffers in memory, and resuse if another no-integrity mount * is requested. */ if (log->no_integrity) goto out; /* * external log as separate logical volume */ list_del(&log->journal_list); bdev_file = log->bdev_file; rc = lmLogShutdown(log); bdev_fput(bdev_file); kfree(log); out: mutex_unlock(&jfs_log_mutex); jfs_info("lmLogClose: exit(%d)", rc); return rc; } /* * NAME: jfs_flush_journal() * * FUNCTION: initiate write of any outstanding transactions to the journal * and optionally wait until they are all written to disk * * wait == 0 flush until latest txn is committed, don't wait * wait == 1 flush until latest txn is committed, wait * wait > 1 flush until all txn's are complete, wait */ void jfs_flush_journal(struct jfs_log *log, int wait) { int i; struct tblock *target = NULL; /* jfs_write_inode may call us during read-only mount */ if (!log) return; jfs_info("jfs_flush_journal: log:0x%p wait=%d", log, wait); LOGGC_LOCK(log); if (!list_empty(&log->cqueue)) { /* * This ensures that we will keep writing to the journal as long * as there are unwritten commit records */ target = list_entry(log->cqueue.prev, struct tblock, cqueue); if (test_bit(log_FLUSH, &log->flag)) { /* * We're already flushing. * if flush_tblk is NULL, we are flushing everything, * so leave it that way. Otherwise, update it to the * latest transaction */ if (log->flush_tblk) log->flush_tblk = target; } else { /* Only flush until latest transaction is committed */ log->flush_tblk = target; set_bit(log_FLUSH, &log->flag); /* * Initiate I/O on outstanding transactions */ if (!(log->cflag & logGC_PAGEOUT)) { log->cflag |= logGC_PAGEOUT; lmGCwrite(log, 0); } } } if ((wait > 1) || test_bit(log_SYNCBARRIER, &log->flag)) { /* Flush until all activity complete */ set_bit(log_FLUSH, &log->flag); log->flush_tblk = NULL; } if (wait && target && !(target->flag & tblkGC_COMMITTED)) { DECLARE_WAITQUEUE(__wait, current); add_wait_queue(&target->gcwait, &__wait); set_current_state(TASK_UNINTERRUPTIBLE); LOGGC_UNLOCK(log); schedule(); LOGGC_LOCK(log); remove_wait_queue(&target->gcwait, &__wait); } LOGGC_UNLOCK(log); if (wait < 2) return; write_special_inodes(log, filemap_fdatawrite); /* * If there was recent activity, we may need to wait * for the lazycommit thread to catch up */ if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) { for (i = 0; i < 200; i++) { /* Too much? */ msleep(250); write_special_inodes(log, filemap_fdatawrite); if (list_empty(&log->cqueue) && list_empty(&log->synclist)) break; } } assert(list_empty(&log->cqueue)); #ifdef CONFIG_JFS_DEBUG if (!list_empty(&log->synclist)) { struct logsyncblk *lp; printk(KERN_ERR "jfs_flush_journal: synclist not empty\n"); list_for_each_entry(lp, &log->synclist, synclist) { if (lp->xflag & COMMIT_PAGE) { struct metapage *mp = (struct metapage *)lp; print_hex_dump(KERN_ERR, "metapage: ", DUMP_PREFIX_ADDRESS, 16, 4, mp, sizeof(struct metapage), 0); print_hex_dump(KERN_ERR, "page: ", DUMP_PREFIX_ADDRESS, 16, sizeof(long), mp->folio, sizeof(struct page), 0); } else print_hex_dump(KERN_ERR, "tblock:", DUMP_PREFIX_ADDRESS, 16, 4, lp, sizeof(struct tblock), 0); } } #else WARN_ON(!list_empty(&log->synclist)); #endif clear_bit(log_FLUSH, &log->flag); } /* * NAME: lmLogShutdown() * * FUNCTION: log shutdown at last LogClose(). * * write log syncpt record. * update super block to set redone flag to 0. * * PARAMETER: log - log inode * * RETURN: 0 - success * * serialization: single last close thread */ int lmLogShutdown(struct jfs_log * log) { int rc; struct lrd lrd; int lsn; struct logsuper *logsuper; struct lbuf *bpsuper; struct lbuf *bp; struct logpage *lp; jfs_info("lmLogShutdown: log:0x%p", log); jfs_flush_journal(log, 2); /* * write the last SYNCPT record with syncpoint = 0 * (i.e., log redo up to HERE !) */ lrd.logtid = 0; lrd.backchain = 0; lrd.type = cpu_to_le16(LOG_SYNCPT); lrd.length = 0; lrd.log.syncpt.sync = 0; lsn = lmWriteRecord(log, NULL, &lrd, NULL); bp = log->bp; lp = (struct logpage *) bp->l_ldata; lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0); lbmIOWait(log->bp, lbmFREE); log->bp = NULL; /* * synchronous update log superblock * mark log state as shutdown cleanly * (i.e., Log does not need to be replayed). */ if ((rc = lbmRead(log, 1, &bpsuper))) goto out; logsuper = (struct logsuper *) bpsuper->l_ldata; logsuper->state = cpu_to_le32(LOGREDONE); logsuper->end = cpu_to_le32(lsn); lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); rc = lbmIOWait(bpsuper, lbmFREE); jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d", lsn, log->page, log->eor); out: /* * shutdown per log i/o */ lbmLogShutdown(log); if (rc) { jfs_warn("lmLogShutdown: exit(%d)", rc); } return rc; } /* * NAME: lmLogFileSystem() * * FUNCTION: insert (<activate> = true)/remove (<activate> = false) * file system into/from log active file system list. * * PARAMETE: log - pointer to logs inode. * fsdev - kdev_t of filesystem. * serial - pointer to returned log serial number * activate - insert/remove device from active list. * * RETURN: 0 - success * errors returned by vms_iowait(). */ static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, int activate) { int rc = 0; int i; struct logsuper *logsuper; struct lbuf *bpsuper; uuid_t *uuid = &sbi->uuid; /* * insert/remove file system device to log active file system list. */ if ((rc = lbmRead(log, 1, &bpsuper))) return rc; logsuper = (struct logsuper *) bpsuper->l_ldata; if (activate) { for (i = 0; i < MAX_ACTIVE; i++) if (uuid_is_null(&logsuper->active[i].uuid)) { uuid_copy(&logsuper->active[i].uuid, uuid); sbi->aggregate = i; break; } if (i == MAX_ACTIVE) { jfs_warn("Too many file systems sharing journal!"); lbmFree(bpsuper); return -EMFILE; /* Is there a better rc? */ } } else { for (i = 0; i < MAX_ACTIVE; i++) if (uuid_equal(&logsuper->active[i].uuid, uuid)) { uuid_copy(&logsuper->active[i].uuid, &uuid_null); break; } if (i == MAX_ACTIVE) { jfs_warn("Somebody stomped on the journal!"); lbmFree(bpsuper); return -EIO; } } /* * synchronous write log superblock: * * write sidestream bypassing write queue: * at file system mount, log super block is updated for * activation of the file system before any log record * (MOUNT record) of the file system, and at file system * unmount, all meta data for the file system has been * flushed before log super block is updated for deactivation * of the file system. */ lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); rc = lbmIOWait(bpsuper, lbmFREE); return rc; } /* * log buffer manager (lbm) * ------------------------ * * special purpose buffer manager supporting log i/o requirements. * * per log write queue: * log pageout occurs in serial order by fifo write queue and * restricting to a single i/o in pregress at any one time. * a circular singly-linked list * (log->wrqueue points to the tail, and buffers are linked via * bp->wrqueue field), and * maintains log page in pageout ot waiting for pageout in serial pageout. */ /* * lbmLogInit() * * initialize per log I/O setup at lmLogInit() */ static int lbmLogInit(struct jfs_log * log) { /* log inode */ int i; struct lbuf *lbuf; jfs_info("lbmLogInit: log:0x%p", log); /* initialize current buffer cursor */ log->bp = NULL; /* initialize log device write queue */ log->wqueue = NULL; /* * Each log has its own buffer pages allocated to it. These are * not managed by the page cache. This ensures that a transaction * writing to the log does not block trying to allocate a page from * the page cache (for the log). This would be bad, since page * allocation waits on the kswapd thread that may be committing inodes * which would cause log activity. Was that clear? I'm trying to * avoid deadlock here. */ init_waitqueue_head(&log->free_wait); log->lbuf_free = NULL; for (i = 0; i < LOGPAGES;) { char *buffer; uint offset; struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) goto error; buffer = page_address(page); for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) { lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL); if (lbuf == NULL) { if (offset == 0) __free_page(page); goto error; } if (offset) /* we already have one reference */ get_page(page); lbuf->l_offset = offset; lbuf->l_ldata = buffer + offset; lbuf->l_page = page; lbuf->l_log = log; init_waitqueue_head(&lbuf->l_ioevent); lbuf->l_freelist = log->lbuf_free; log->lbuf_free = lbuf; i++; } } return (0); error: lbmLogShutdown(log); return -ENOMEM; } /* * lbmLogShutdown() * * finalize per log I/O setup at lmLogShutdown() */ static void lbmLogShutdown(struct jfs_log * log) { struct lbuf *lbuf; jfs_info("lbmLogShutdown: log:0x%p", log); lbuf = log->lbuf_free; while (lbuf) { struct lbuf *next = lbuf->l_freelist; __free_page(lbuf->l_page); kfree(lbuf); lbuf = next; } } /* * lbmAllocate() * * allocate an empty log buffer */ static struct lbuf *lbmAllocate(struct jfs_log * log, int pn) { struct lbuf *bp; unsigned long flags; /* * recycle from log buffer freelist if any */ LCACHE_LOCK(flags); LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags); log->lbuf_free = bp->l_freelist; LCACHE_UNLOCK(flags); bp->l_flag = 0; bp->l_wqnext = NULL; bp->l_freelist = NULL; bp->l_pn = pn; bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize)); bp->l_ceor = 0; return bp; } /* * lbmFree() * * release a log buffer to freelist */ static void lbmFree(struct lbuf * bp) { unsigned long flags; LCACHE_LOCK(flags); lbmfree(bp); LCACHE_UNLOCK(flags); } static void lbmfree(struct lbuf * bp) { struct jfs_log *log = bp->l_log; assert(bp->l_wqnext == NULL); /* * return the buffer to head of freelist */ bp->l_freelist = log->lbuf_free; log->lbuf_free = bp; wake_up(&log->free_wait); return; } /* * NAME: lbmRedrive * * FUNCTION: add a log buffer to the log redrive list * * PARAMETER: * bp - log buffer * * NOTES: * Takes log_redrive_lock. */ static inline void lbmRedrive(struct lbuf *bp) { unsigned long flags; spin_lock_irqsave(&log_redrive_lock, flags); bp->l_redrive_next = log_redrive_list; log_redrive_list = bp; spin_unlock_irqrestore(&log_redrive_lock, flags); wake_up_process(jfsIOthread); } /* * lbmRead() */ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) { struct bio *bio; struct lbuf *bp; /* * allocate a log buffer */ *bpp = bp = lbmAllocate(log, pn); jfs_info("lbmRead: bp:0x%p pn:0x%x", bp, pn); bp->l_flag |= lbmREAD; bio = bio_alloc(file_bdev(log->bdev_file), 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; bio->bi_private = bp; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; lbmIODone(bio); } else { submit_bio(bio); } wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); return 0; } /* * lbmWrite() * * buffer at head of pageout queue stays after completion of * partial-page pageout and redriven by explicit initiation of * pageout by caller until full-page pageout is completed and * released. * * device driver i/o done redrives pageout of new buffer at * head of pageout queue when current buffer at head of pageout * queue is released at the completion of its full-page pageout. * * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit(). * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone() */ static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block) { struct lbuf *tail; unsigned long flags; jfs_info("lbmWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn); /* map the logical block address to physical block address */ bp->l_blkno = log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); LCACHE_LOCK(flags); /* disable+lock */ /* * initialize buffer for device driver */ bp->l_flag = flag; /* * insert bp at tail of write queue associated with log * * (request is either for bp already/currently at head of queue * or new bp to be inserted at tail) */ tail = log->wqueue; /* is buffer not already on write queue ? */ if (bp->l_wqnext == NULL) { /* insert at tail of wqueue */ if (tail == NULL) { log->wqueue = bp; bp->l_wqnext = bp; } else { log->wqueue = bp; bp->l_wqnext = tail->l_wqnext; tail->l_wqnext = bp; } tail = bp; } /* is buffer at head of wqueue and for write ? */ if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) { LCACHE_UNLOCK(flags); /* unlock+enable */ return; } LCACHE_UNLOCK(flags); /* unlock+enable */ if (cant_block) lbmRedrive(bp); else if (flag & lbmSYNC) lbmStartIO(bp); else { LOGGC_UNLOCK(log); lbmStartIO(bp); LOGGC_LOCK(log); } } /* * lbmDirectWrite() * * initiate pageout bypassing write queue for sidestream * (e.g., log superblock) write; */ static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag) { jfs_info("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn); /* * initialize buffer for device driver */ bp->l_flag = flag | lbmDIRECT; /* map the logical block address to physical block address */ bp->l_blkno = log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); /* * initiate pageout of the page */ lbmStartIO(bp); } /* * NAME: lbmStartIO() * * FUNCTION: Interface to DD strategy routine * * RETURN: none * * serialization: LCACHE_LOCK() is NOT held during log i/o; */ static void lbmStartIO(struct lbuf * bp) { struct bio *bio; struct jfs_log *log = bp->l_log; struct block_device *bdev = NULL; jfs_info("lbmStartIO"); if (!log->no_integrity) bdev = file_bdev(log->bdev_file); bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; bio->bi_private = bp; /* check if journaling to disk has been disabled */ if (log->no_integrity) { bio->bi_iter.bi_size = 0; lbmIODone(bio); } else { submit_bio(bio); INCREMENT(lmStat.submitted); } } /* * lbmIOWait() */ static int lbmIOWait(struct lbuf * bp, int flag) { unsigned long flags; int rc = 0; jfs_info("lbmIOWait1: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag); LCACHE_LOCK(flags); /* disable+lock */ LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), flags); rc = (bp->l_flag & lbmERROR) ? -EIO : 0; if (flag & lbmFREE) lbmfree(bp); LCACHE_UNLOCK(flags); /* unlock+enable */ jfs_info("lbmIOWait2: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag); return rc; } /* * lbmIODone() * * executed at INTIODONE level */ static void lbmIODone(struct bio *bio) { struct lbuf *bp = bio->bi_private; struct lbuf *nextbp, *tail; struct jfs_log *log; unsigned long flags; /* * get back jfs buffer bound to the i/o buffer */ jfs_info("lbmIODone: bp:0x%p flag:0x%x", bp, bp->l_flag); LCACHE_LOCK(flags); /* disable+lock */ bp->l_flag |= lbmDONE; if (bio->bi_status) { bp->l_flag |= lbmERROR; jfs_err("lbmIODone: I/O error in JFS log"); } bio_put(bio); /* * pagein completion */ if (bp->l_flag & lbmREAD) { bp->l_flag &= ~lbmREAD; LCACHE_UNLOCK(flags); /* unlock+enable */ /* wakeup I/O initiator */ LCACHE_WAKEUP(&bp->l_ioevent); return; } /* * pageout completion * * the bp at the head of write queue has completed pageout. * * if single-commit/full-page pageout, remove the current buffer * from head of pageout queue, and redrive pageout with * the new buffer at head of pageout queue; * otherwise, the partial-page pageout buffer stays at * the head of pageout queue to be redriven for pageout * by lmGroupCommit() until full-page pageout is completed. */ bp->l_flag &= ~lbmWRITE; INCREMENT(lmStat.pagedone); /* update committed lsn */ log = bp->l_log; log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor; if (bp->l_flag & lbmDIRECT) { LCACHE_WAKEUP(&bp->l_ioevent); LCACHE_UNLOCK(flags); return; } tail = log->wqueue; /* single element queue */ if (bp == tail) { /* remove head buffer of full-page pageout * from log device write queue */ if (bp->l_flag & lbmRELEASE) { log->wqueue = NULL; bp->l_wqnext = NULL; } } /* multi element queue */ else { /* remove head buffer of full-page pageout * from log device write queue */ if (bp->l_flag & lbmRELEASE) { nextbp = tail->l_wqnext = bp->l_wqnext; bp->l_wqnext = NULL; /* * redrive pageout of next page at head of write queue: * redrive next page without any bound tblk * (i.e., page w/o any COMMIT records), or * first page of new group commit which has been * queued after current page (subsequent pageout * is performed synchronously, except page without * any COMMITs) by lmGroupCommit() as indicated * by lbmWRITE flag; */ if (nextbp->l_flag & lbmWRITE) { /* * We can't do the I/O at interrupt time. * The jfsIO thread can do it */ lbmRedrive(nextbp); } } } /* * synchronous pageout: * * buffer has not necessarily been removed from write queue * (e.g., synchronous write of partial-page with COMMIT): * leave buffer for i/o initiator to dispose */ if (bp->l_flag & lbmSYNC) { LCACHE_UNLOCK(flags); /* unlock+enable */ /* wakeup I/O initiator */ LCACHE_WAKEUP(&bp->l_ioevent); } /* * Group Commit pageout: */ else if (bp->l_flag & lbmGC) { LCACHE_UNLOCK(flags); lmPostGC(bp); } /* * asynchronous pageout: * * buffer must have been removed from write queue: * insert buffer at head of freelist where it can be recycled */ else { assert(bp->l_flag & lbmRELEASE); assert(bp->l_flag & lbmFREE); lbmfree(bp); LCACHE_UNLOCK(flags); /* unlock+enable */ } } int jfsIOWait(void *arg) { struct lbuf *bp; do { spin_lock_irq(&log_redrive_lock); while ((bp = log_redrive_list)) { log_redrive_list = bp->l_redrive_next; bp->l_redrive_next = NULL; spin_unlock_irq(&log_redrive_lock); lbmStartIO(bp); spin_lock_irq(&log_redrive_lock); } if (freezing(current)) { spin_unlock_irq(&log_redrive_lock); try_to_freeze(); } else { set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(&log_redrive_lock); schedule(); } } while (!kthread_should_stop()); jfs_info("jfsIOWait being killed!"); return 0; } /* * NAME: lmLogFormat()/jfs_logform() * * FUNCTION: format file system log * * PARAMETERS: * log - volume log * logAddress - start address of log space in FS block * logSize - length of log space in FS block; * * RETURN: 0 - success * -EIO - i/o error * * XXX: We're synchronously writing one page at a time. This needs to * be improved by writing multiple pages at once. */ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) { int rc = -EIO; struct jfs_sb_info *sbi; struct logsuper *logsuper; struct logpage *lp; int lspn; /* log sequence page number */ struct lrd *lrd_ptr; int npages = 0; struct lbuf *bp; jfs_info("lmLogFormat: logAddress:%Ld logSize:%d", (long long)logAddress, logSize); sbi = list_entry(log->sb_list.next, struct jfs_sb_info, log_list); /* allocate a log buffer */ bp = lbmAllocate(log, 1); npages = logSize >> sbi->l2nbperpage; /* * log space: * * page 0 - reserved; * page 1 - log superblock; * page 2 - log data page: A SYNC log record is written * into this page at logform time; * pages 3-N - log data page: set to empty log data pages; */ /* * init log superblock: log page 1 */ logsuper = (struct logsuper *) bp->l_ldata; logsuper->magic = cpu_to_le32(LOGMAGIC); logsuper->version = cpu_to_le32(LOGVERSION); logsuper->state = cpu_to_le32(LOGREDONE); logsuper->flag = cpu_to_le32(sbi->mntflag); /* ? */ logsuper->size = cpu_to_le32(npages); logsuper->bsize = cpu_to_le32(sbi->bsize); logsuper->l2bsize = cpu_to_le32(sbi->l2bsize); logsuper->end = cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE); bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; bp->l_blkno = logAddress + sbi->nbperpage; lbmStartIO(bp); if ((rc = lbmIOWait(bp, 0))) goto exit; /* * init pages 2 to npages-1 as log data pages: * * log page sequence number (lpsn) initialization: * * pn: 0 1 2 3 n-1 * +-----+-----+=====+=====+===.....===+=====+ * lspn: N-1 0 1 N-2 * <--- N page circular file ----> * * the N (= npages-2) data pages of the log is maintained as * a circular file for the log records; * lpsn grows by 1 monotonically as each log page is written * to the circular file of the log; * and setLogpage() will not reset the page number even if * the eor is equal to LOGPHDRSIZE. In order for binary search * still work in find log end process, we have to simulate the * log wrap situation at the log format time. * The 1st log page written will have the highest lpsn. Then * the succeeding log pages will have ascending order of * the lspn starting from 0, ... (N-2) */ lp = (struct logpage *) bp->l_ldata; /* * initialize 1st log page to be written: lpsn = N - 1, * write a SYNCPT log record is written to this page */ lp->h.page = lp->t.page = cpu_to_le32(npages - 3); lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE); lrd_ptr = (struct lrd *) &lp->data; lrd_ptr->logtid = 0; lrd_ptr->backchain = 0; lrd_ptr->type = cpu_to_le16(LOG_SYNCPT); lrd_ptr->length = 0; lrd_ptr->log.syncpt.sync = 0; bp->l_blkno += sbi->nbperpage; bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; lbmStartIO(bp); if ((rc = lbmIOWait(bp, 0))) goto exit; /* * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) */ for (lspn = 0; lspn < npages - 3; lspn++) { lp->h.page = lp->t.page = cpu_to_le32(lspn); lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); bp->l_blkno += sbi->nbperpage; bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; lbmStartIO(bp); if ((rc = lbmIOWait(bp, 0))) goto exit; } rc = 0; exit: /* * finalize log */ /* release the buffer */ lbmFree(bp); return rc; } #ifdef CONFIG_JFS_STATISTICS int jfs_lmstats_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS Logmgr stats\n" "================\n" "commits = %d\n" "writes submitted = %d\n" "writes completed = %d\n" "full pages submitted = %d\n" "partial pages submitted = %d\n", lmStat.commit, lmStat.submitted, lmStat.pagedone, lmStat.full_page, lmStat.partial_page); return 0; } #endif /* CONFIG_JFS_STATISTICS */
36 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2009 IBM Corporation * Author: Mimi Zohar <zohar@us.ibm.com> */ #ifndef _LINUX_INTEGRITY_H #define _LINUX_INTEGRITY_H #include <linux/fs.h> #include <linux/iversion.h> enum integrity_status { INTEGRITY_PASS = 0, INTEGRITY_PASS_IMMUTABLE, INTEGRITY_FAIL, INTEGRITY_FAIL_IMMUTABLE, INTEGRITY_NOLABEL, INTEGRITY_NOXATTRS, INTEGRITY_UNKNOWN, }; #ifdef CONFIG_INTEGRITY extern void __init integrity_load_keys(void); #else static inline void integrity_load_keys(void) { } #endif /* CONFIG_INTEGRITY */ /* An inode's attributes for detection of changes */ struct integrity_inode_attributes { u64 version; /* track inode changes */ unsigned long ino; dev_t dev; }; /* * On stacked filesystems the i_version alone is not enough to detect file data * or metadata change. Additional metadata is required. */ static inline void integrity_inode_attrs_store(struct integrity_inode_attributes *attrs, u64 i_version, const struct inode *inode) { attrs->version = i_version; attrs->dev = inode->i_sb->s_dev; attrs->ino = inode->i_ino; } /* * On stacked filesystems detect whether the inode or its content has changed. */ static inline bool integrity_inode_attrs_changed(const struct integrity_inode_attributes *attrs, const struct inode *inode) { return (inode->i_sb->s_dev != attrs->dev || inode->i_ino != attrs->ino || !inode_eq_iversion(inode, attrs->version)); } #endif /* _LINUX_INTEGRITY_H */
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 /* * Copyright (c) 2006 Damien Bergamini <damien.bergamini@free.fr> * Copyright (c) 2006 Sam Leffler, Errno Consulting * Copyright (c) 2007 Christoph Hellwig <hch@lst.de> * Copyright (c) 2008-2009 Weongyo Jeong <weongyo@freebsd.org> * Copyright (c) 2012 Pontus Fuchs <pontus.fuchs@gmail.com> * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* * This driver is based on the uath driver written by Damien Bergamini for * OpenBSD, who did black-box analysis of the Windows binary driver to find * out how the hardware works. It contains a lot magic numbers because of * that and only has minimal functionality. */ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/list.h> #include <linux/completion.h> #include <linux/firmware.h> #include <linux/skbuff.h> #include <linux/usb.h> #include <net/mac80211.h> #include "ar5523.h" #include "ar5523_hw.h" /* * Various supported device vendors/products. * UB51: AR5005UG 802.11b/g, UB52: AR5005UX 802.11a/b/g */ static int ar5523_submit_rx_cmd(struct ar5523 *ar); static void ar5523_data_tx_pkt_put(struct ar5523 *ar); static void ar5523_read_reply(struct ar5523 *ar, struct ar5523_cmd_hdr *hdr, struct ar5523_tx_cmd *cmd) { int dlen, olen; __be32 *rp; dlen = be32_to_cpu(hdr->len) - sizeof(*hdr); if (dlen < 0) { WARN_ON(1); goto out; } ar5523_dbg(ar, "Code = %d len = %d\n", be32_to_cpu(hdr->code) & 0xff, dlen); rp = (__be32 *)(hdr + 1); if (dlen >= sizeof(u32)) { olen = be32_to_cpu(rp[0]); dlen -= sizeof(u32); if (olen == 0) { /* convention is 0 =>'s one word */ olen = sizeof(u32); } } else olen = 0; if (cmd->odata) { if (cmd->olen < olen) { ar5523_err(ar, "olen too small %d < %d\n", cmd->olen, olen); cmd->olen = 0; cmd->res = -EOVERFLOW; } else { cmd->olen = olen; memcpy(cmd->odata, &rp[1], olen); cmd->res = 0; } } out: complete(&cmd->done); } static void ar5523_cmd_rx_cb(struct urb *urb) { struct ar5523 *ar = urb->context; struct ar5523_tx_cmd *cmd = &ar->tx_cmd; struct ar5523_cmd_hdr *hdr = ar->rx_cmd_buf; int dlen; u32 code, hdrlen; if (urb->status) { if (urb->status != -ESHUTDOWN) ar5523_err(ar, "RX USB error %d.\n", urb->status); goto skip; } if (urb->actual_length < sizeof(struct ar5523_cmd_hdr)) { ar5523_err(ar, "RX USB too short.\n"); goto skip; } ar5523_dbg(ar, "%s code %02x priv %d\n", __func__, be32_to_cpu(hdr->code) & 0xff, hdr->priv); code = be32_to_cpu(hdr->code); hdrlen = be32_to_cpu(hdr->len); switch (code & 0xff) { default: /* reply to a read command */ if (hdr->priv != AR5523_CMD_ID) { ar5523_err(ar, "Unexpected command id: %02x\n", code & 0xff); goto skip; } ar5523_read_reply(ar, hdr, cmd); break; case WDCMSG_DEVICE_AVAIL: ar5523_dbg(ar, "WDCMSG_DEVICE_AVAIL\n"); cmd->res = 0; cmd->olen = 0; complete(&cmd->done); break; case WDCMSG_SEND_COMPLETE: ar5523_dbg(ar, "WDCMSG_SEND_COMPLETE: %d pending\n", atomic_read(&ar->tx_nr_pending)); if (!test_bit(AR5523_HW_UP, &ar->flags)) ar5523_dbg(ar, "Unexpected WDCMSG_SEND_COMPLETE\n"); else { mod_timer(&ar->tx_wd_timer, jiffies + AR5523_TX_WD_TIMEOUT); ar5523_data_tx_pkt_put(ar); } break; case WDCMSG_TARGET_START: /* This command returns a bogus id so it needs special handling */ dlen = hdrlen - sizeof(*hdr); if (dlen != (int)sizeof(u32)) { ar5523_err(ar, "Invalid reply to WDCMSG_TARGET_START"); return; } if (!cmd->odata) { ar5523_err(ar, "Unexpected WDCMSG_TARGET_START reply"); return; } memcpy(cmd->odata, hdr + 1, sizeof(u32)); cmd->olen = sizeof(u32); cmd->res = 0; complete(&cmd->done); break; case WDCMSG_STATS_UPDATE: ar5523_dbg(ar, "WDCMSG_STATS_UPDATE\n"); break; } skip: ar5523_submit_rx_cmd(ar); } static int ar5523_alloc_rx_cmd(struct ar5523 *ar) { ar->rx_cmd_urb = usb_alloc_urb(0, GFP_KERNEL); if (!ar->rx_cmd_urb) return -ENOMEM; ar->rx_cmd_buf = usb_alloc_coherent(ar->dev, AR5523_MAX_RXCMDSZ, GFP_KERNEL, &ar->rx_cmd_urb->transfer_dma); if (!ar->rx_cmd_buf) { usb_free_urb(ar->rx_cmd_urb); return -ENOMEM; } return 0; } static void ar5523_cancel_rx_cmd(struct ar5523 *ar) { usb_kill_urb(ar->rx_cmd_urb); } static void ar5523_free_rx_cmd(struct ar5523 *ar) { usb_free_coherent(ar->dev, AR5523_MAX_RXCMDSZ, ar->rx_cmd_buf, ar->rx_cmd_urb->transfer_dma); usb_free_urb(ar->rx_cmd_urb); } static int ar5523_submit_rx_cmd(struct ar5523 *ar) { int error; usb_fill_bulk_urb(ar->rx_cmd_urb, ar->dev, ar5523_cmd_rx_pipe(ar->dev), ar->rx_cmd_buf, AR5523_MAX_RXCMDSZ, ar5523_cmd_rx_cb, ar); ar->rx_cmd_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; error = usb_submit_urb(ar->rx_cmd_urb, GFP_ATOMIC); if (error) { if (error != -ENODEV) ar5523_err(ar, "error %d when submitting rx urb\n", error); return error; } return 0; } /* * Command submitted cb */ static void ar5523_cmd_tx_cb(struct urb *urb) { struct ar5523_tx_cmd *cmd = urb->context; struct ar5523 *ar = cmd->ar; if (urb->status) { ar5523_err(ar, "Failed to TX command. Status = %d\n", urb->status); cmd->res = urb->status; complete(&cmd->done); return; } if (!(cmd->flags & AR5523_CMD_FLAG_READ)) { cmd->res = 0; complete(&cmd->done); } } static void ar5523_cancel_tx_cmd(struct ar5523 *ar) { usb_kill_urb(ar->tx_cmd.urb_tx); } static int ar5523_cmd(struct ar5523 *ar, u32 code, const void *idata, int ilen, void *odata, int olen, int flags) { struct ar5523_cmd_hdr *hdr; struct ar5523_tx_cmd *cmd = &ar->tx_cmd; int xferlen, error; /* always bulk-out a multiple of 4 bytes */ xferlen = (sizeof(struct ar5523_cmd_hdr) + ilen + 3) & ~3; hdr = cmd->buf_tx; memset(hdr, 0, sizeof(struct ar5523_cmd_hdr)); hdr->len = cpu_to_be32(xferlen); hdr->code = cpu_to_be32(code); hdr->priv = AR5523_CMD_ID; if (flags & AR5523_CMD_FLAG_MAGIC) hdr->magic = cpu_to_be32(1 << 24); if (ilen) memcpy(hdr + 1, idata, ilen); cmd->odata = odata; cmd->olen = olen; cmd->flags = flags; ar5523_dbg(ar, "do cmd %02x\n", code); usb_fill_bulk_urb(cmd->urb_tx, ar->dev, ar5523_cmd_tx_pipe(ar->dev), cmd->buf_tx, xferlen, ar5523_cmd_tx_cb, cmd); cmd->urb_tx->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; error = usb_submit_urb(cmd->urb_tx, GFP_KERNEL); if (error) { ar5523_err(ar, "could not send command 0x%x, error=%d\n", code, error); return error; } if (!wait_for_completion_timeout(&cmd->done, 2 * HZ)) { ar5523_cancel_tx_cmd(ar); cmd->odata = NULL; ar5523_err(ar, "timeout waiting for command %02x reply\n", code); cmd->res = -ETIMEDOUT; } return cmd->res; } static int ar5523_cmd_write(struct ar5523 *ar, u32 code, const void *data, int len, int flags) { flags &= ~AR5523_CMD_FLAG_READ; return ar5523_cmd(ar, code, data, len, NULL, 0, flags); } static int ar5523_cmd_read(struct ar5523 *ar, u32 code, const void *idata, int ilen, void *odata, int olen, int flags) { flags |= AR5523_CMD_FLAG_READ; return ar5523_cmd(ar, code, idata, ilen, odata, olen, flags); } static int ar5523_config(struct ar5523 *ar, u32 reg, u32 val) { struct ar5523_write_mac write; int error; write.reg = cpu_to_be32(reg); write.len = cpu_to_be32(0); /* 0 = single write */ *(__be32 *)write.data = cpu_to_be32(val); error = ar5523_cmd_write(ar, WDCMSG_TARGET_SET_CONFIG, &write, 3 * sizeof(u32), 0); if (error != 0) ar5523_err(ar, "could not write register 0x%02x\n", reg); return error; } static int ar5523_config_multi(struct ar5523 *ar, u32 reg, const void *data, int len) { struct ar5523_write_mac write; int error; write.reg = cpu_to_be32(reg); write.len = cpu_to_be32(len); memcpy(write.data, data, len); /* properly handle the case where len is zero (reset) */ error = ar5523_cmd_write(ar, WDCMSG_TARGET_SET_CONFIG, &write, (len == 0) ? sizeof(u32) : 2 * sizeof(u32) + len, 0); if (error != 0) ar5523_err(ar, "could not write %d bytes to register 0x%02x\n", len, reg); return error; } static int ar5523_get_status(struct ar5523 *ar, u32 which, void *odata, int olen) { int error; __be32 which_be; which_be = cpu_to_be32(which); error = ar5523_cmd_read(ar, WDCMSG_TARGET_GET_STATUS, &which_be, sizeof(which_be), odata, olen, AR5523_CMD_FLAG_MAGIC); if (error != 0) ar5523_err(ar, "could not read EEPROM offset 0x%02x\n", which); return error; } static int ar5523_get_capability(struct ar5523 *ar, u32 cap, u32 *val) { int error; __be32 cap_be, val_be; cap_be = cpu_to_be32(cap); error = ar5523_cmd_read(ar, WDCMSG_TARGET_GET_CAPABILITY, &cap_be, sizeof(cap_be), &val_be, sizeof(__be32), AR5523_CMD_FLAG_MAGIC); if (error != 0) { ar5523_err(ar, "could not read capability %u\n", cap); return error; } *val = be32_to_cpu(val_be); return error; } static int ar5523_get_devcap(struct ar5523 *ar) { #define GETCAP(x) do { \ error = ar5523_get_capability(ar, x, &cap); \ if (error != 0) \ return error; \ ar5523_info(ar, "Cap: " \ "%s=0x%08x\n", #x, cap); \ } while (0) int error; u32 cap; /* collect device capabilities */ GETCAP(CAP_TARGET_VERSION); GETCAP(CAP_TARGET_REVISION); GETCAP(CAP_MAC_VERSION); GETCAP(CAP_MAC_REVISION); GETCAP(CAP_PHY_REVISION); GETCAP(CAP_ANALOG_5GHz_REVISION); GETCAP(CAP_ANALOG_2GHz_REVISION); GETCAP(CAP_REG_DOMAIN); GETCAP(CAP_REG_CAP_BITS); GETCAP(CAP_WIRELESS_MODES); GETCAP(CAP_CHAN_SPREAD_SUPPORT); GETCAP(CAP_COMPRESS_SUPPORT); GETCAP(CAP_BURST_SUPPORT); GETCAP(CAP_FAST_FRAMES_SUPPORT); GETCAP(CAP_CHAP_TUNING_SUPPORT); GETCAP(CAP_TURBOG_SUPPORT); GETCAP(CAP_TURBO_PRIME_SUPPORT); GETCAP(CAP_DEVICE_TYPE); GETCAP(CAP_WME_SUPPORT); GETCAP(CAP_TOTAL_QUEUES); GETCAP(CAP_CONNECTION_ID_MAX); GETCAP(CAP_LOW_5GHZ_CHAN); GETCAP(CAP_HIGH_5GHZ_CHAN); GETCAP(CAP_LOW_2GHZ_CHAN); GETCAP(CAP_HIGH_2GHZ_CHAN); GETCAP(CAP_TWICE_ANTENNAGAIN_5G); GETCAP(CAP_TWICE_ANTENNAGAIN_2G); GETCAP(CAP_CIPHER_AES_CCM); GETCAP(CAP_CIPHER_TKIP); GETCAP(CAP_MIC_TKIP); return 0; } static int ar5523_set_ledsteady(struct ar5523 *ar, int lednum, int ledmode) { struct ar5523_cmd_ledsteady led; led.lednum = cpu_to_be32(lednum); led.ledmode = cpu_to_be32(ledmode); ar5523_dbg(ar, "set %s led %s (steady)\n", (lednum == UATH_LED_LINK) ? "link" : "activity", ledmode ? "on" : "off"); return ar5523_cmd_write(ar, WDCMSG_SET_LED_STEADY, &led, sizeof(led), 0); } static int ar5523_set_rxfilter(struct ar5523 *ar, u32 bits, u32 op) { struct ar5523_cmd_rx_filter rxfilter; rxfilter.bits = cpu_to_be32(bits); rxfilter.op = cpu_to_be32(op); ar5523_dbg(ar, "setting Rx filter=0x%x flags=0x%x\n", bits, op); return ar5523_cmd_write(ar, WDCMSG_RX_FILTER, &rxfilter, sizeof(rxfilter), 0); } static int ar5523_reset_tx_queues(struct ar5523 *ar) { __be32 qid = cpu_to_be32(0); ar5523_dbg(ar, "resetting Tx queue\n"); return ar5523_cmd_write(ar, WDCMSG_RELEASE_TX_QUEUE, &qid, sizeof(qid), 0); } static int ar5523_set_chan(struct ar5523 *ar) { struct ieee80211_conf *conf = &ar->hw->conf; struct ar5523_cmd_reset reset; memset(&reset, 0, sizeof(reset)); reset.flags |= cpu_to_be32(UATH_CHAN_2GHZ); reset.flags |= cpu_to_be32(UATH_CHAN_OFDM); reset.freq = cpu_to_be32(conf->chandef.chan->center_freq); reset.maxrdpower = cpu_to_be32(50); /* XXX */ reset.channelchange = cpu_to_be32(1); reset.keeprccontent = cpu_to_be32(0); ar5523_dbg(ar, "set chan flags 0x%x freq %d\n", be32_to_cpu(reset.flags), conf->chandef.chan->center_freq); return ar5523_cmd_write(ar, WDCMSG_RESET, &reset, sizeof(reset), 0); } static int ar5523_queue_init(struct ar5523 *ar) { struct ar5523_cmd_txq_setup qinfo; ar5523_dbg(ar, "setting up Tx queue\n"); qinfo.qid = cpu_to_be32(0); qinfo.len = cpu_to_be32(sizeof(qinfo.attr)); qinfo.attr.priority = cpu_to_be32(0); /* XXX */ qinfo.attr.aifs = cpu_to_be32(3); qinfo.attr.logcwmin = cpu_to_be32(4); qinfo.attr.logcwmax = cpu_to_be32(10); qinfo.attr.bursttime = cpu_to_be32(0); qinfo.attr.mode = cpu_to_be32(0); qinfo.attr.qflags = cpu_to_be32(1); /* XXX? */ return ar5523_cmd_write(ar, WDCMSG_SETUP_TX_QUEUE, &qinfo, sizeof(qinfo), 0); } static int ar5523_switch_chan(struct ar5523 *ar) { int error; error = ar5523_set_chan(ar); if (error) { ar5523_err(ar, "could not set chan, error %d\n", error); goto out_err; } /* reset Tx rings */ error = ar5523_reset_tx_queues(ar); if (error) { ar5523_err(ar, "could not reset Tx queues, error %d\n", error); goto out_err; } /* set Tx rings WME properties */ error = ar5523_queue_init(ar); if (error) ar5523_err(ar, "could not init wme, error %d\n", error); out_err: return error; } static void ar5523_rx_data_put(struct ar5523 *ar, struct ar5523_rx_data *data) { unsigned long flags; spin_lock_irqsave(&ar->rx_data_list_lock, flags); list_move(&data->list, &ar->rx_data_free); spin_unlock_irqrestore(&ar->rx_data_list_lock, flags); } static void ar5523_data_rx_cb(struct urb *urb) { struct ar5523_rx_data *data = urb->context; struct ar5523 *ar = data->ar; struct ar5523_rx_desc *desc; struct ar5523_chunk *chunk; struct ieee80211_hw *hw = ar->hw; struct ieee80211_rx_status *rx_status; u32 rxlen; int usblen = urb->actual_length; int hdrlen, pad; ar5523_dbg(ar, "%s\n", __func__); /* sync/async unlink faults aren't errors */ if (urb->status) { if (urb->status != -ESHUTDOWN) ar5523_err(ar, "%s: USB err: %d\n", __func__, urb->status); goto skip; } if (usblen < AR5523_MIN_RXBUFSZ) { ar5523_err(ar, "RX: wrong xfer size (usblen=%d)\n", usblen); goto skip; } chunk = (struct ar5523_chunk *) data->skb->data; if (((chunk->flags & UATH_CFLAGS_FINAL) == 0) || chunk->seqnum != 0) { ar5523_dbg(ar, "RX: No final flag. s: %d f: %02x l: %d\n", chunk->seqnum, chunk->flags, be16_to_cpu(chunk->length)); goto skip; } /* Rx descriptor is located at the end, 32-bit aligned */ desc = (struct ar5523_rx_desc *) (data->skb->data + usblen - sizeof(struct ar5523_rx_desc)); rxlen = be32_to_cpu(desc->len); if (rxlen > ar->rxbufsz) { ar5523_dbg(ar, "RX: Bad descriptor (len=%d)\n", be32_to_cpu(desc->len)); goto skip; } if (!rxlen) { ar5523_dbg(ar, "RX: rxlen is 0\n"); goto skip; } if (be32_to_cpu(desc->status) != 0) { ar5523_dbg(ar, "Bad RX status (0x%x len = %d). Skip\n", be32_to_cpu(desc->status), be32_to_cpu(desc->len)); goto skip; } skb_reserve(data->skb, sizeof(*chunk)); skb_put(data->skb, rxlen - sizeof(struct ar5523_rx_desc)); hdrlen = ieee80211_get_hdrlen_from_skb(data->skb); if (!IS_ALIGNED(hdrlen, 4)) { ar5523_dbg(ar, "eek, alignment workaround activated\n"); pad = ALIGN(hdrlen, 4) - hdrlen; memmove(data->skb->data + pad, data->skb->data, hdrlen); skb_pull(data->skb, pad); skb_put(data->skb, pad); } rx_status = IEEE80211_SKB_RXCB(data->skb); memset(rx_status, 0, sizeof(*rx_status)); rx_status->freq = be32_to_cpu(desc->channel); rx_status->band = hw->conf.chandef.chan->band; rx_status->signal = -95 + be32_to_cpu(desc->rssi); ieee80211_rx_irqsafe(hw, data->skb); data->skb = NULL; skip: if (data->skb) { dev_kfree_skb_irq(data->skb); data->skb = NULL; } ar5523_rx_data_put(ar, data); if (atomic_inc_return(&ar->rx_data_free_cnt) >= AR5523_RX_DATA_REFILL_COUNT && test_bit(AR5523_HW_UP, &ar->flags)) queue_work(ar->wq, &ar->rx_refill_work); } static void ar5523_rx_refill_work(struct work_struct *work) { struct ar5523 *ar = container_of(work, struct ar5523, rx_refill_work); struct ar5523_rx_data *data; unsigned long flags; int error; ar5523_dbg(ar, "%s\n", __func__); do { spin_lock_irqsave(&ar->rx_data_list_lock, flags); if (!list_empty(&ar->rx_data_free)) data = (struct ar5523_rx_data *) ar->rx_data_free.next; else data = NULL; spin_unlock_irqrestore(&ar->rx_data_list_lock, flags); if (!data) goto done; data->skb = alloc_skb(ar->rxbufsz, GFP_KERNEL); if (!data->skb) { ar5523_err(ar, "could not allocate rx skbuff\n"); return; } usb_fill_bulk_urb(data->urb, ar->dev, ar5523_data_rx_pipe(ar->dev), data->skb->data, ar->rxbufsz, ar5523_data_rx_cb, data); spin_lock_irqsave(&ar->rx_data_list_lock, flags); list_move(&data->list, &ar->rx_data_used); spin_unlock_irqrestore(&ar->rx_data_list_lock, flags); atomic_dec(&ar->rx_data_free_cnt); error = usb_submit_urb(data->urb, GFP_KERNEL); if (error) { kfree_skb(data->skb); if (error != -ENODEV) ar5523_err(ar, "Err sending rx data urb %d\n", error); ar5523_rx_data_put(ar, data); atomic_inc(&ar->rx_data_free_cnt); return; } } while (true); done: return; } static void ar5523_cancel_rx_bufs(struct ar5523 *ar) { struct ar5523_rx_data *data; unsigned long flags; do { spin_lock_irqsave(&ar->rx_data_list_lock, flags); if (!list_empty(&ar->rx_data_used)) data = (struct ar5523_rx_data *) ar->rx_data_used.next; else data = NULL; spin_unlock_irqrestore(&ar->rx_data_list_lock, flags); if (!data) break; usb_kill_urb(data->urb); list_move(&data->list, &ar->rx_data_free); atomic_inc(&ar->rx_data_free_cnt); } while (data); } static void ar5523_free_rx_bufs(struct ar5523 *ar) { struct ar5523_rx_data *data; ar5523_cancel_rx_bufs(ar); while (!list_empty(&ar->rx_data_free)) { data = (struct ar5523_rx_data *) ar->rx_data_free.next; list_del(&data->list); usb_free_urb(data->urb); } } static int ar5523_alloc_rx_bufs(struct ar5523 *ar) { int i; for (i = 0; i < AR5523_RX_DATA_COUNT; i++) { struct ar5523_rx_data *data = &ar->rx_data[i]; data->ar = ar; data->urb = usb_alloc_urb(0, GFP_KERNEL); if (!data->urb) goto err; list_add_tail(&data->list, &ar->rx_data_free); atomic_inc(&ar->rx_data_free_cnt); } return 0; err: ar5523_free_rx_bufs(ar); return -ENOMEM; } static void ar5523_data_tx_pkt_put(struct ar5523 *ar) { atomic_dec(&ar->tx_nr_total); if (!atomic_dec_return(&ar->tx_nr_pending)) { timer_delete(&ar->tx_wd_timer); wake_up(&ar->tx_flush_waitq); } if (atomic_read(&ar->tx_nr_total) < AR5523_TX_DATA_RESTART_COUNT) { ar5523_dbg(ar, "restart tx queue\n"); ieee80211_wake_queues(ar->hw); } } static void ar5523_data_tx_cb(struct urb *urb) { struct sk_buff *skb = urb->context; struct ieee80211_tx_info *txi = IEEE80211_SKB_CB(skb); struct ar5523_tx_data *data = (struct ar5523_tx_data *) txi->driver_data; struct ar5523 *ar = data->ar; unsigned long flags; ar5523_dbg(ar, "data tx urb completed: %d\n", urb->status); spin_lock_irqsave(&ar->tx_data_list_lock, flags); list_del(&data->list); spin_unlock_irqrestore(&ar->tx_data_list_lock, flags); if (urb->status) { ar5523_dbg(ar, "%s: urb status: %d\n", __func__, urb->status); ar5523_data_tx_pkt_put(ar); ieee80211_free_txskb(ar->hw, skb); } else { skb_pull(skb, sizeof(struct ar5523_tx_desc) + sizeof(__be32)); ieee80211_tx_status_irqsafe(ar->hw, skb); } usb_free_urb(urb); } static void ar5523_tx(struct ieee80211_hw *hw, struct ieee80211_tx_control *control, struct sk_buff *skb) { struct ieee80211_tx_info *txi = IEEE80211_SKB_CB(skb); struct ar5523_tx_data *data = (struct ar5523_tx_data *) txi->driver_data; struct ar5523 *ar = hw->priv; unsigned long flags; ar5523_dbg(ar, "tx called\n"); if (atomic_inc_return(&ar->tx_nr_total) >= AR5523_TX_DATA_COUNT) { ar5523_dbg(ar, "tx queue full\n"); ar5523_dbg(ar, "stop queues (tot %d pend %d)\n", atomic_read(&ar->tx_nr_total), atomic_read(&ar->tx_nr_pending)); ieee80211_stop_queues(hw); } spin_lock_irqsave(&ar->tx_data_list_lock, flags); list_add_tail(&data->list, &ar->tx_queue_pending); spin_unlock_irqrestore(&ar->tx_data_list_lock, flags); ieee80211_queue_work(ar->hw, &ar->tx_work); } static void ar5523_tx_work_locked(struct ar5523 *ar) { struct ar5523_tx_data *data; struct ar5523_tx_desc *desc; struct ar5523_chunk *chunk; struct ieee80211_tx_info *txi; struct urb *urb; struct sk_buff *skb; int error = 0, paylen; u32 txqid; unsigned long flags; BUILD_BUG_ON(sizeof(struct ar5523_tx_data) > IEEE80211_TX_INFO_DRIVER_DATA_SIZE); ar5523_dbg(ar, "%s\n", __func__); do { spin_lock_irqsave(&ar->tx_data_list_lock, flags); if (!list_empty(&ar->tx_queue_pending)) { data = (struct ar5523_tx_data *) ar->tx_queue_pending.next; list_del(&data->list); } else data = NULL; spin_unlock_irqrestore(&ar->tx_data_list_lock, flags); if (!data) break; txi = container_of((void *)data, struct ieee80211_tx_info, driver_data); txqid = 0; skb = container_of((void *)txi, struct sk_buff, cb); paylen = skb->len; urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) { ieee80211_free_txskb(ar->hw, skb); continue; } data->ar = ar; data->urb = urb; desc = skb_push(skb, sizeof(*desc)); chunk = skb_push(skb, sizeof(*chunk)); chunk->seqnum = 0; chunk->flags = UATH_CFLAGS_FINAL; chunk->length = cpu_to_be16(skb->len); desc->msglen = cpu_to_be32(skb->len); desc->msgid = AR5523_DATA_ID; desc->buflen = cpu_to_be32(paylen); desc->type = cpu_to_be32(WDCMSG_SEND); desc->flags = cpu_to_be32(UATH_TX_NOTIFY); if (test_bit(AR5523_CONNECTED, &ar->flags)) desc->connid = cpu_to_be32(AR5523_ID_BSS); else desc->connid = cpu_to_be32(AR5523_ID_BROADCAST); if (txi->flags & IEEE80211_TX_CTL_USE_MINRATE) txqid |= UATH_TXQID_MINRATE; desc->txqid = cpu_to_be32(txqid); urb->transfer_flags = URB_ZERO_PACKET; usb_fill_bulk_urb(urb, ar->dev, ar5523_data_tx_pipe(ar->dev), skb->data, skb->len, ar5523_data_tx_cb, skb); spin_lock_irqsave(&ar->tx_data_list_lock, flags); list_add_tail(&data->list, &ar->tx_queue_submitted); spin_unlock_irqrestore(&ar->tx_data_list_lock, flags); mod_timer(&ar->tx_wd_timer, jiffies + AR5523_TX_WD_TIMEOUT); atomic_inc(&ar->tx_nr_pending); ar5523_dbg(ar, "TX Frame (%d pending)\n", atomic_read(&ar->tx_nr_pending)); error = usb_submit_urb(urb, GFP_KERNEL); if (error) { ar5523_err(ar, "error %d when submitting tx urb\n", error); spin_lock_irqsave(&ar->tx_data_list_lock, flags); list_del(&data->list); spin_unlock_irqrestore(&ar->tx_data_list_lock, flags); atomic_dec(&ar->tx_nr_pending); ar5523_data_tx_pkt_put(ar); usb_free_urb(urb); ieee80211_free_txskb(ar->hw, skb); } } while (true); } static void ar5523_tx_work(struct work_struct *work) { struct ar5523 *ar = container_of(work, struct ar5523, tx_work); ar5523_dbg(ar, "%s\n", __func__); mutex_lock(&ar->mutex); ar5523_tx_work_locked(ar); mutex_unlock(&ar->mutex); } static void ar5523_tx_wd_timer(struct timer_list *t) { struct ar5523 *ar = timer_container_of(ar, t, tx_wd_timer); ar5523_dbg(ar, "TX watchdog timer triggered\n"); ieee80211_queue_work(ar->hw, &ar->tx_wd_work); } static void ar5523_tx_wd_work(struct work_struct *work) { struct ar5523 *ar = container_of(work, struct ar5523, tx_wd_work); /* Occasionally the TX queues stop responding. The only way to * recover seems to be to reset the dongle. */ mutex_lock(&ar->mutex); ar5523_err(ar, "TX queue stuck (tot %d pend %d)\n", atomic_read(&ar->tx_nr_total), atomic_read(&ar->tx_nr_pending)); ar5523_err(ar, "Will restart dongle.\n"); ar5523_cmd_write(ar, WDCMSG_TARGET_RESET, NULL, 0, 0); mutex_unlock(&ar->mutex); } static void ar5523_flush_tx(struct ar5523 *ar) { ar5523_tx_work_locked(ar); /* Don't waste time trying to flush if USB is disconnected */ if (test_bit(AR5523_USB_DISCONNECTED, &ar->flags)) return; if (!wait_event_timeout(ar->tx_flush_waitq, !atomic_read(&ar->tx_nr_pending), AR5523_FLUSH_TIMEOUT)) ar5523_err(ar, "flush timeout (tot %d pend %d)\n", atomic_read(&ar->tx_nr_total), atomic_read(&ar->tx_nr_pending)); } static void ar5523_free_tx_cmd(struct ar5523 *ar) { struct ar5523_tx_cmd *cmd = &ar->tx_cmd; usb_free_coherent(ar->dev, AR5523_MAX_RXCMDSZ, cmd->buf_tx, cmd->urb_tx->transfer_dma); usb_free_urb(cmd->urb_tx); } static int ar5523_alloc_tx_cmd(struct ar5523 *ar) { struct ar5523_tx_cmd *cmd = &ar->tx_cmd; cmd->ar = ar; init_completion(&cmd->done); cmd->urb_tx = usb_alloc_urb(0, GFP_KERNEL); if (!cmd->urb_tx) return -ENOMEM; cmd->buf_tx = usb_alloc_coherent(ar->dev, AR5523_MAX_TXCMDSZ, GFP_KERNEL, &cmd->urb_tx->transfer_dma); if (!cmd->buf_tx) { usb_free_urb(cmd->urb_tx); return -ENOMEM; } return 0; } /* * This function is called periodically (every second) when associated to * query device statistics. */ static void ar5523_stat_work(struct work_struct *work) { struct ar5523 *ar = container_of(work, struct ar5523, stat_work.work); int error; ar5523_dbg(ar, "%s\n", __func__); mutex_lock(&ar->mutex); /* * Send request for statistics asynchronously once a second. This * seems to be important. Throughput is a lot better if this is done. */ error = ar5523_cmd_write(ar, WDCMSG_TARGET_GET_STATS, NULL, 0, 0); if (error) ar5523_err(ar, "could not query stats, error %d\n", error); mutex_unlock(&ar->mutex); ieee80211_queue_delayed_work(ar->hw, &ar->stat_work, HZ); } /* * Interface routines to the mac80211 stack. */ static int ar5523_start(struct ieee80211_hw *hw) { struct ar5523 *ar = hw->priv; int error; __be32 val; ar5523_dbg(ar, "start called\n"); mutex_lock(&ar->mutex); val = cpu_to_be32(0); ar5523_cmd_write(ar, WDCMSG_BIND, &val, sizeof(val), 0); /* set MAC address */ ar5523_config_multi(ar, CFG_MAC_ADDR, &ar->hw->wiphy->perm_addr, ETH_ALEN); /* XXX honor net80211 state */ ar5523_config(ar, CFG_RATE_CONTROL_ENABLE, 0x00000001); ar5523_config(ar, CFG_DIVERSITY_CTL, 0x00000001); ar5523_config(ar, CFG_ABOLT, 0x0000003f); ar5523_config(ar, CFG_WME_ENABLED, 0x00000000); ar5523_config(ar, CFG_SERVICE_TYPE, 1); ar5523_config(ar, CFG_TP_SCALE, 0x00000000); ar5523_config(ar, CFG_TPC_HALF_DBM5, 0x0000003c); ar5523_config(ar, CFG_TPC_HALF_DBM2, 0x0000003c); ar5523_config(ar, CFG_OVERRD_TX_POWER, 0x00000000); ar5523_config(ar, CFG_GMODE_PROTECTION, 0x00000000); ar5523_config(ar, CFG_GMODE_PROTECT_RATE_INDEX, 0x00000003); ar5523_config(ar, CFG_PROTECTION_TYPE, 0x00000000); ar5523_config(ar, CFG_MODE_CTS, 0x00000002); error = ar5523_cmd_read(ar, WDCMSG_TARGET_START, NULL, 0, &val, sizeof(val), AR5523_CMD_FLAG_MAGIC); if (error) { ar5523_dbg(ar, "could not start target, error %d\n", error); goto err; } ar5523_dbg(ar, "WDCMSG_TARGET_START returns handle: 0x%x\n", be32_to_cpu(val)); ar5523_switch_chan(ar); val = cpu_to_be32(TARGET_DEVICE_AWAKE); ar5523_cmd_write(ar, WDCMSG_SET_PWR_MODE, &val, sizeof(val), 0); /* XXX? check */ ar5523_cmd_write(ar, WDCMSG_RESET_KEY_CACHE, NULL, 0, 0); set_bit(AR5523_HW_UP, &ar->flags); queue_work(ar->wq, &ar->rx_refill_work); /* enable Rx */ ar5523_set_rxfilter(ar, 0, UATH_FILTER_OP_INIT); ar5523_set_rxfilter(ar, UATH_FILTER_RX_UCAST | UATH_FILTER_RX_MCAST | UATH_FILTER_RX_BCAST | UATH_FILTER_RX_BEACON, UATH_FILTER_OP_SET); ar5523_set_ledsteady(ar, UATH_LED_ACTIVITY, UATH_LED_ON); ar5523_dbg(ar, "start OK\n"); err: mutex_unlock(&ar->mutex); return error; } static void ar5523_stop(struct ieee80211_hw *hw, bool suspend) { struct ar5523 *ar = hw->priv; ar5523_dbg(ar, "stop called\n"); cancel_delayed_work_sync(&ar->stat_work); mutex_lock(&ar->mutex); clear_bit(AR5523_HW_UP, &ar->flags); ar5523_set_ledsteady(ar, UATH_LED_LINK, UATH_LED_OFF); ar5523_set_ledsteady(ar, UATH_LED_ACTIVITY, UATH_LED_OFF); ar5523_cmd_write(ar, WDCMSG_TARGET_STOP, NULL, 0, 0); timer_delete_sync(&ar->tx_wd_timer); cancel_work_sync(&ar->tx_wd_work); cancel_work_sync(&ar->rx_refill_work); ar5523_cancel_rx_bufs(ar); mutex_unlock(&ar->mutex); } static int ar5523_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 value) { struct ar5523 *ar = hw->priv; int ret; ar5523_dbg(ar, "set_rts_threshold called\n"); mutex_lock(&ar->mutex); ret = ar5523_config(ar, CFG_USER_RTS_THRESHOLD, value); mutex_unlock(&ar->mutex); return ret; } static void ar5523_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 queues, bool drop) { struct ar5523 *ar = hw->priv; ar5523_dbg(ar, "flush called\n"); ar5523_flush_tx(ar); } static int ar5523_add_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct ar5523 *ar = hw->priv; ar5523_dbg(ar, "add interface called\n"); if (ar->vif) { ar5523_dbg(ar, "invalid add_interface\n"); return -EOPNOTSUPP; } switch (vif->type) { case NL80211_IFTYPE_STATION: ar->vif = vif; break; default: return -EOPNOTSUPP; } return 0; } static void ar5523_remove_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct ar5523 *ar = hw->priv; ar5523_dbg(ar, "remove interface called\n"); ar->vif = NULL; } static int ar5523_hwconfig(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ar5523 *ar = hw->priv; ar5523_dbg(ar, "config called\n"); mutex_lock(&ar->mutex); if (changed & IEEE80211_CONF_CHANGE_CHANNEL) { ar5523_dbg(ar, "Do channel switch\n"); ar5523_flush_tx(ar); ar5523_switch_chan(ar); } mutex_unlock(&ar->mutex); return 0; } static int ar5523_get_wlan_mode(struct ar5523 *ar, struct ieee80211_bss_conf *bss_conf) { struct ieee80211_supported_band *band; int bit; struct ieee80211_sta *sta; u32 sta_rate_set; band = ar->hw->wiphy->bands[ar->hw->conf.chandef.chan->band]; sta = ieee80211_find_sta(ar->vif, bss_conf->bssid); if (!sta) { ar5523_info(ar, "STA not found!\n"); return WLAN_MODE_11b; } sta_rate_set = sta->deflink.supp_rates[ar->hw->conf.chandef.chan->band]; for (bit = 0; bit < band->n_bitrates; bit++) { if (sta_rate_set & 1) { int rate = band->bitrates[bit].bitrate; switch (rate) { case 60: case 90: case 120: case 180: case 240: case 360: case 480: case 540: return WLAN_MODE_11g; } } sta_rate_set >>= 1; } return WLAN_MODE_11b; } static void ar5523_create_rateset(struct ar5523 *ar, struct ieee80211_bss_conf *bss_conf, struct ar5523_cmd_rateset *rs, bool basic) { struct ieee80211_supported_band *band; struct ieee80211_sta *sta; int bit, i = 0; u32 sta_rate_set, basic_rate_set; sta = ieee80211_find_sta(ar->vif, bss_conf->bssid); basic_rate_set = bss_conf->basic_rates; if (!sta) { ar5523_info(ar, "STA not found. Cannot set rates\n"); sta_rate_set = bss_conf->basic_rates; } else sta_rate_set = sta->deflink.supp_rates[ar->hw->conf.chandef.chan->band]; ar5523_dbg(ar, "sta rate_set = %08x\n", sta_rate_set); band = ar->hw->wiphy->bands[ar->hw->conf.chandef.chan->band]; for (bit = 0; bit < band->n_bitrates; bit++) { BUG_ON(i >= AR5523_MAX_NRATES); ar5523_dbg(ar, "Considering rate %d : %d\n", band->bitrates[bit].hw_value, sta_rate_set & 1); if (sta_rate_set & 1) { rs->set[i] = band->bitrates[bit].hw_value; if (basic_rate_set & 1 && basic) rs->set[i] |= 0x80; i++; } sta_rate_set >>= 1; basic_rate_set >>= 1; } rs->length = i; } static int ar5523_set_basic_rates(struct ar5523 *ar, struct ieee80211_bss_conf *bss) { struct ar5523_cmd_rates rates; memset(&rates, 0, sizeof(rates)); rates.connid = cpu_to_be32(2); /* XXX */ rates.size = cpu_to_be32(sizeof(struct ar5523_cmd_rateset)); ar5523_create_rateset(ar, bss, &rates.rateset, true); return ar5523_cmd_write(ar, WDCMSG_SET_BASIC_RATE, &rates, sizeof(rates), 0); } static int ar5523_create_connection(struct ar5523 *ar, struct ieee80211_vif *vif, struct ieee80211_bss_conf *bss) { struct ar5523_cmd_create_connection create; int wlan_mode; memset(&create, 0, sizeof(create)); create.connid = cpu_to_be32(2); create.bssid = cpu_to_be32(0); /* XXX packed or not? */ create.size = cpu_to_be32(sizeof(struct ar5523_cmd_rateset)); ar5523_create_rateset(ar, bss, &create.connattr.rateset, false); wlan_mode = ar5523_get_wlan_mode(ar, bss); create.connattr.wlanmode = cpu_to_be32(wlan_mode); return ar5523_cmd_write(ar, WDCMSG_CREATE_CONNECTION, &create, sizeof(create), 0); } static int ar5523_write_associd(struct ar5523 *ar, struct ieee80211_vif *vif) { struct ieee80211_bss_conf *bss = &vif->bss_conf; struct ar5523_cmd_set_associd associd; memset(&associd, 0, sizeof(associd)); associd.defaultrateix = cpu_to_be32(0); /* XXX */ associd.associd = cpu_to_be32(vif->cfg.aid); associd.timoffset = cpu_to_be32(0x3b); /* XXX */ memcpy(associd.bssid, bss->bssid, ETH_ALEN); return ar5523_cmd_write(ar, WDCMSG_WRITE_ASSOCID, &associd, sizeof(associd), 0); } static void ar5523_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *bss, u64 changed) { struct ar5523 *ar = hw->priv; int error; ar5523_dbg(ar, "bss_info_changed called\n"); mutex_lock(&ar->mutex); if (!(changed & BSS_CHANGED_ASSOC)) goto out_unlock; if (vif->cfg.assoc) { error = ar5523_create_connection(ar, vif, bss); if (error) { ar5523_err(ar, "could not create connection\n"); goto out_unlock; } error = ar5523_set_basic_rates(ar, bss); if (error) { ar5523_err(ar, "could not set negotiated rate set\n"); goto out_unlock; } error = ar5523_write_associd(ar, vif); if (error) { ar5523_err(ar, "could not set association\n"); goto out_unlock; } /* turn link LED on */ ar5523_set_ledsteady(ar, UATH_LED_LINK, UATH_LED_ON); set_bit(AR5523_CONNECTED, &ar->flags); ieee80211_queue_delayed_work(hw, &ar->stat_work, HZ); } else { cancel_delayed_work(&ar->stat_work); clear_bit(AR5523_CONNECTED, &ar->flags); ar5523_set_ledsteady(ar, UATH_LED_LINK, UATH_LED_OFF); } out_unlock: mutex_unlock(&ar->mutex); } #define AR5523_SUPPORTED_FILTERS (FIF_ALLMULTI | \ FIF_FCSFAIL | \ FIF_OTHER_BSS) static void ar5523_configure_filter(struct ieee80211_hw *hw, unsigned int changed_flags, unsigned int *total_flags, u64 multicast) { struct ar5523 *ar = hw->priv; u32 filter = 0; ar5523_dbg(ar, "configure_filter called\n"); mutex_lock(&ar->mutex); ar5523_flush_tx(ar); *total_flags &= AR5523_SUPPORTED_FILTERS; /* The filters seems strange. UATH_FILTER_RX_BCAST and * UATH_FILTER_RX_MCAST does not result in those frames being RXed. * The only way I have found to get [mb]cast frames seems to be * to set UATH_FILTER_RX_PROM. */ filter |= UATH_FILTER_RX_UCAST | UATH_FILTER_RX_MCAST | UATH_FILTER_RX_BCAST | UATH_FILTER_RX_BEACON | UATH_FILTER_RX_PROM; ar5523_set_rxfilter(ar, 0, UATH_FILTER_OP_INIT); ar5523_set_rxfilter(ar, filter, UATH_FILTER_OP_SET); mutex_unlock(&ar->mutex); } static const struct ieee80211_ops ar5523_ops = { .add_chanctx = ieee80211_emulate_add_chanctx, .remove_chanctx = ieee80211_emulate_remove_chanctx, .change_chanctx = ieee80211_emulate_change_chanctx, .switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx, .start = ar5523_start, .stop = ar5523_stop, .tx = ar5523_tx, .wake_tx_queue = ieee80211_handle_wake_tx_queue, .set_rts_threshold = ar5523_set_rts_threshold, .add_interface = ar5523_add_interface, .remove_interface = ar5523_remove_interface, .config = ar5523_hwconfig, .bss_info_changed = ar5523_bss_info_changed, .configure_filter = ar5523_configure_filter, .flush = ar5523_flush, }; static int ar5523_host_available(struct ar5523 *ar) { struct ar5523_cmd_host_available setup; /* inform target the host is available */ setup.sw_ver_major = cpu_to_be32(ATH_SW_VER_MAJOR); setup.sw_ver_minor = cpu_to_be32(ATH_SW_VER_MINOR); setup.sw_ver_patch = cpu_to_be32(ATH_SW_VER_PATCH); setup.sw_ver_build = cpu_to_be32(ATH_SW_VER_BUILD); return ar5523_cmd_read(ar, WDCMSG_HOST_AVAILABLE, &setup, sizeof(setup), NULL, 0, 0); } static int ar5523_get_devstatus(struct ar5523 *ar) { u8 macaddr[ETH_ALEN]; int error; /* retrieve MAC address */ error = ar5523_get_status(ar, ST_MAC_ADDR, macaddr, ETH_ALEN); if (error) { ar5523_err(ar, "could not read MAC address\n"); return error; } SET_IEEE80211_PERM_ADDR(ar->hw, macaddr); error = ar5523_get_status(ar, ST_SERIAL_NUMBER, &ar->serial[0], sizeof(ar->serial)); if (error) { ar5523_err(ar, "could not read device serial number\n"); return error; } return 0; } #define AR5523_SANE_RXBUFSZ 2000 static int ar5523_get_max_rxsz(struct ar5523 *ar) { int error; __be32 rxsize; /* Get max rx size */ error = ar5523_get_status(ar, ST_WDC_TRANSPORT_CHUNK_SIZE, &rxsize, sizeof(rxsize)); if (error != 0) { ar5523_err(ar, "could not read max RX size\n"); return error; } ar->rxbufsz = be32_to_cpu(rxsize); if (!ar->rxbufsz || ar->rxbufsz > AR5523_SANE_RXBUFSZ) { ar5523_err(ar, "Bad rxbufsz from device. Using %d instead\n", AR5523_SANE_RXBUFSZ); ar->rxbufsz = AR5523_SANE_RXBUFSZ; } ar5523_dbg(ar, "Max RX buf size: %d\n", ar->rxbufsz); return 0; } /* * This is copied from rtl818x, but we should probably move this * to common code as in OpenBSD. */ static const struct ieee80211_rate ar5523_rates[] = { { .bitrate = 10, .hw_value = 2, }, { .bitrate = 20, .hw_value = 4 }, { .bitrate = 55, .hw_value = 11, }, { .bitrate = 110, .hw_value = 22, }, { .bitrate = 60, .hw_value = 12, }, { .bitrate = 90, .hw_value = 18, }, { .bitrate = 120, .hw_value = 24, }, { .bitrate = 180, .hw_value = 36, }, { .bitrate = 240, .hw_value = 48, }, { .bitrate = 360, .hw_value = 72, }, { .bitrate = 480, .hw_value = 96, }, { .bitrate = 540, .hw_value = 108, }, }; static const struct ieee80211_channel ar5523_channels[] = { { .center_freq = 2412 }, { .center_freq = 2417 }, { .center_freq = 2422 }, { .center_freq = 2427 }, { .center_freq = 2432 }, { .center_freq = 2437 }, { .center_freq = 2442 }, { .center_freq = 2447 }, { .center_freq = 2452 }, { .center_freq = 2457 }, { .center_freq = 2462 }, { .center_freq = 2467 }, { .center_freq = 2472 }, { .center_freq = 2484 }, }; static int ar5523_init_modes(struct ar5523 *ar) { BUILD_BUG_ON(sizeof(ar->channels) != sizeof(ar5523_channels)); BUILD_BUG_ON(sizeof(ar->rates) != sizeof(ar5523_rates)); memcpy(ar->channels, ar5523_channels, sizeof(ar5523_channels)); memcpy(ar->rates, ar5523_rates, sizeof(ar5523_rates)); ar->band.band = NL80211_BAND_2GHZ; ar->band.channels = ar->channels; ar->band.n_channels = ARRAY_SIZE(ar5523_channels); ar->band.bitrates = ar->rates; ar->band.n_bitrates = ARRAY_SIZE(ar5523_rates); ar->hw->wiphy->bands[NL80211_BAND_2GHZ] = &ar->band; return 0; } /* * Load the MIPS R4000 microcode into the device. Once the image is loaded, * the device will detach itself from the bus and reattach later with a new * product Id (a la ezusb). */ static int ar5523_load_firmware(struct usb_device *dev) { struct ar5523_fwblock *txblock, *rxblock; const struct firmware *fw; void *fwbuf; int len, offset; int foolen; /* XXX(hch): handle short transfers */ int error = -ENXIO; if (request_firmware(&fw, AR5523_FIRMWARE_FILE, &dev->dev)) { dev_err(&dev->dev, "no firmware found: %s\n", AR5523_FIRMWARE_FILE); return -ENOENT; } txblock = kzalloc(sizeof(*txblock), GFP_KERNEL); if (!txblock) goto out; rxblock = kmalloc(sizeof(*rxblock), GFP_KERNEL); if (!rxblock) goto out_free_txblock; fwbuf = kmalloc(AR5523_MAX_FWBLOCK_SIZE, GFP_KERNEL); if (!fwbuf) goto out_free_rxblock; txblock->flags = cpu_to_be32(AR5523_WRITE_BLOCK); txblock->total = cpu_to_be32(fw->size); offset = 0; len = fw->size; while (len > 0) { int mlen = min(len, AR5523_MAX_FWBLOCK_SIZE); txblock->remain = cpu_to_be32(len - mlen); txblock->len = cpu_to_be32(mlen); /* send firmware block meta-data */ error = usb_bulk_msg(dev, ar5523_cmd_tx_pipe(dev), txblock, sizeof(*txblock), &foolen, AR5523_CMD_TIMEOUT); if (error) { dev_err(&dev->dev, "could not send firmware block info\n"); goto out_free_fwbuf; } /* send firmware block data */ memcpy(fwbuf, fw->data + offset, mlen); error = usb_bulk_msg(dev, ar5523_data_tx_pipe(dev), fwbuf, mlen, &foolen, AR5523_DATA_TIMEOUT); if (error) { dev_err(&dev->dev, "could not send firmware block data\n"); goto out_free_fwbuf; } /* wait for ack from firmware */ error = usb_bulk_msg(dev, ar5523_cmd_rx_pipe(dev), rxblock, sizeof(*rxblock), &foolen, AR5523_CMD_TIMEOUT); if (error) { dev_err(&dev->dev, "could not read firmware answer\n"); goto out_free_fwbuf; } len -= mlen; offset += mlen; } /* * Set the error to -ENXIO to make sure we continue probing for * a driver. */ error = -ENXIO; out_free_fwbuf: kfree(fwbuf); out_free_rxblock: kfree(rxblock); out_free_txblock: kfree(txblock); out: release_firmware(fw); return error; } static int ar5523_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *dev = interface_to_usbdev(intf); struct ieee80211_hw *hw; struct ar5523 *ar; int error = -ENOMEM; static const u8 bulk_ep_addr[] = { AR5523_CMD_TX_PIPE | USB_DIR_OUT, AR5523_DATA_TX_PIPE | USB_DIR_OUT, AR5523_CMD_RX_PIPE | USB_DIR_IN, AR5523_DATA_RX_PIPE | USB_DIR_IN, 0}; if (!usb_check_bulk_endpoints(intf, bulk_ep_addr)) { dev_err(&dev->dev, "Could not find all expected endpoints\n"); error = -ENODEV; goto out; } /* * Load firmware if the device requires it. This will return * -ENXIO on success and we'll get called back afer the usb * id changes to indicate that the firmware is present. */ if (id->driver_info & AR5523_FLAG_PRE_FIRMWARE) return ar5523_load_firmware(dev); hw = ieee80211_alloc_hw(sizeof(*ar), &ar5523_ops); if (!hw) goto out; SET_IEEE80211_DEV(hw, &intf->dev); ar = hw->priv; ar->hw = hw; ar->dev = dev; mutex_init(&ar->mutex); INIT_DELAYED_WORK(&ar->stat_work, ar5523_stat_work); timer_setup(&ar->tx_wd_timer, ar5523_tx_wd_timer, 0); INIT_WORK(&ar->tx_wd_work, ar5523_tx_wd_work); INIT_WORK(&ar->tx_work, ar5523_tx_work); INIT_LIST_HEAD(&ar->tx_queue_pending); INIT_LIST_HEAD(&ar->tx_queue_submitted); spin_lock_init(&ar->tx_data_list_lock); atomic_set(&ar->tx_nr_total, 0); atomic_set(&ar->tx_nr_pending, 0); init_waitqueue_head(&ar->tx_flush_waitq); atomic_set(&ar->rx_data_free_cnt, 0); INIT_WORK(&ar->rx_refill_work, ar5523_rx_refill_work); INIT_LIST_HEAD(&ar->rx_data_free); INIT_LIST_HEAD(&ar->rx_data_used); spin_lock_init(&ar->rx_data_list_lock); ar->wq = create_singlethread_workqueue("ar5523"); if (!ar->wq) { ar5523_err(ar, "Could not create wq\n"); goto out_free_ar; } error = ar5523_alloc_rx_bufs(ar); if (error) { ar5523_err(ar, "Could not allocate rx buffers\n"); goto out_free_wq; } error = ar5523_alloc_rx_cmd(ar); if (error) { ar5523_err(ar, "Could not allocate rx command buffers\n"); goto out_free_rx_bufs; } error = ar5523_alloc_tx_cmd(ar); if (error) { ar5523_err(ar, "Could not allocate tx command buffers\n"); goto out_free_rx_cmd; } error = ar5523_submit_rx_cmd(ar); if (error) { ar5523_err(ar, "Failed to submit rx cmd\n"); goto out_free_tx_cmd; } /* * We're now ready to send/receive firmware commands. */ error = ar5523_host_available(ar); if (error) { ar5523_err(ar, "could not initialize adapter\n"); goto out_cancel_rx_cmd; } error = ar5523_get_max_rxsz(ar); if (error) { ar5523_err(ar, "could not get caps from adapter\n"); goto out_cancel_rx_cmd; } error = ar5523_get_devcap(ar); if (error) { ar5523_err(ar, "could not get caps from adapter\n"); goto out_cancel_rx_cmd; } error = ar5523_get_devstatus(ar); if (error != 0) { ar5523_err(ar, "could not get device status\n"); goto out_cancel_rx_cmd; } ar5523_info(ar, "MAC/BBP AR5523, RF AR%c112\n", (id->driver_info & AR5523_FLAG_ABG) ? '5' : '2'); ar->vif = NULL; ieee80211_hw_set(hw, HAS_RATE_CONTROL); ieee80211_hw_set(hw, RX_INCLUDES_FCS); ieee80211_hw_set(hw, SIGNAL_DBM); hw->extra_tx_headroom = sizeof(struct ar5523_tx_desc) + sizeof(struct ar5523_chunk); hw->wiphy->interface_modes = BIT(NL80211_IFTYPE_STATION); hw->queues = 1; error = ar5523_init_modes(ar); if (error) goto out_cancel_rx_cmd; wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_CQM_RSSI_LIST); usb_set_intfdata(intf, hw); error = ieee80211_register_hw(hw); if (error) { ar5523_err(ar, "could not register device\n"); goto out_cancel_rx_cmd; } ar5523_info(ar, "Found and initialized AR5523 device\n"); return 0; out_cancel_rx_cmd: ar5523_cancel_rx_cmd(ar); out_free_tx_cmd: ar5523_free_tx_cmd(ar); out_free_rx_cmd: ar5523_free_rx_cmd(ar); out_free_rx_bufs: ar5523_free_rx_bufs(ar); out_free_wq: destroy_workqueue(ar->wq); out_free_ar: ieee80211_free_hw(hw); out: return error; } static void ar5523_disconnect(struct usb_interface *intf) { struct ieee80211_hw *hw = usb_get_intfdata(intf); struct ar5523 *ar = hw->priv; ar5523_dbg(ar, "detaching\n"); set_bit(AR5523_USB_DISCONNECTED, &ar->flags); ieee80211_unregister_hw(hw); ar5523_cancel_rx_cmd(ar); ar5523_free_tx_cmd(ar); ar5523_free_rx_cmd(ar); ar5523_free_rx_bufs(ar); destroy_workqueue(ar->wq); ieee80211_free_hw(hw); usb_set_intfdata(intf, NULL); } #define AR5523_DEVICE_UG(vendor, device) \ { USB_DEVICE((vendor), (device)) }, \ { USB_DEVICE((vendor), (device) + 1), \ .driver_info = AR5523_FLAG_PRE_FIRMWARE } #define AR5523_DEVICE_UX(vendor, device) \ { USB_DEVICE((vendor), (device)), \ .driver_info = AR5523_FLAG_ABG }, \ { USB_DEVICE((vendor), (device) + 1), \ .driver_info = AR5523_FLAG_ABG|AR5523_FLAG_PRE_FIRMWARE } static const struct usb_device_id ar5523_id_table[] = { AR5523_DEVICE_UG(0x168c, 0x0001), /* Atheros / AR5523 */ AR5523_DEVICE_UG(0x0cf3, 0x0001), /* Atheros2 / AR5523_1 */ AR5523_DEVICE_UG(0x0cf3, 0x0003), /* Atheros2 / AR5523_2 */ AR5523_DEVICE_UX(0x0cf3, 0x0005), /* Atheros2 / AR5523_3 */ AR5523_DEVICE_UG(0x0d8e, 0x7801), /* Conceptronic / AR5523_1 */ AR5523_DEVICE_UX(0x0d8e, 0x7811), /* Conceptronic / AR5523_2 */ AR5523_DEVICE_UX(0x2001, 0x3a00), /* Dlink / DWLAG132 */ AR5523_DEVICE_UG(0x2001, 0x3a02), /* Dlink / DWLG132 */ AR5523_DEVICE_UX(0x2001, 0x3a04), /* Dlink / DWLAG122 */ AR5523_DEVICE_UG(0x07d1, 0x3a07), /* D-Link / WUA-2340 rev A1 */ AR5523_DEVICE_UG(0x1690, 0x0712), /* Gigaset / AR5523 */ AR5523_DEVICE_UG(0x1690, 0x0710), /* Gigaset / SMCWUSBTG */ AR5523_DEVICE_UG(0x129b, 0x160b), /* Gigaset / USB stick 108 (CyberTAN Technology) */ AR5523_DEVICE_UG(0x16ab, 0x7801), /* Globalsun / AR5523_1 */ AR5523_DEVICE_UX(0x16ab, 0x7811), /* Globalsun / AR5523_2 */ AR5523_DEVICE_UG(0x0d8e, 0x7802), /* Globalsun / AR5523_3 */ AR5523_DEVICE_UX(0x0846, 0x4300), /* Netgear / WG111U */ AR5523_DEVICE_UG(0x0846, 0x4250), /* Netgear / WG111T */ AR5523_DEVICE_UG(0x0846, 0x5f00), /* Netgear / WPN111 */ AR5523_DEVICE_UG(0x083a, 0x4506), /* SMC / EZ Connect SMCWUSBT-G2 */ AR5523_DEVICE_UG(0x157e, 0x3006), /* Umedia / AR5523_1, TEW444UBEU*/ AR5523_DEVICE_UX(0x157e, 0x3205), /* Umedia / AR5523_2 */ AR5523_DEVICE_UG(0x1435, 0x0826), /* Wistronneweb / AR5523_1 */ AR5523_DEVICE_UX(0x1435, 0x0828), /* Wistronneweb / AR5523_2 */ AR5523_DEVICE_UG(0x0cde, 0x0012), /* Zcom / AR5523 */ AR5523_DEVICE_UG(0x1385, 0x4250), /* Netgear3 / WG111T (2) */ AR5523_DEVICE_UG(0x1385, 0x5f00), /* Netgear / WPN111 */ AR5523_DEVICE_UG(0x1385, 0x5f02), /* Netgear / WPN111 */ { } }; MODULE_DEVICE_TABLE(usb, ar5523_id_table); static struct usb_driver ar5523_driver = { .name = "ar5523", .id_table = ar5523_id_table, .probe = ar5523_probe, .disconnect = ar5523_disconnect, }; module_usb_driver(ar5523_driver); MODULE_DESCRIPTION("Atheros AR5523 wireless driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_FIRMWARE(AR5523_FIRMWARE_FILE);
21 13 20 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 /* * linux/fs/nls/nls_cp857.c * * Charset cp857 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x0131, 0x00c4, 0x00c5, /* 0x90*/ 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x0130, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, 0x015e, 0x015f, /* 0xa0*/ 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x011e, 0x011f, 0x00bf, 0x00ae, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, /* 0xb0*/ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x00c0, 0x00a9, 0x2563, 0x2551, 0x2557, 0x255d, 0x00a2, 0x00a5, 0x2510, /* 0xc0*/ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x00e3, 0x00c3, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, /* 0xd0*/ 0x00ba, 0x00aa, 0x00ca, 0x00cb, 0x00c8, 0x0000, 0x00cd, 0x00ce, 0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580, /* 0xe0*/ 0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x0000, 0x00d7, 0x00da, 0x00db, 0x00d9, 0x00ec, 0x00ff, 0x00af, 0x00b4, /* 0xf0*/ 0x00ad, 0x00b1, 0x0000, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8, 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xff, 0xad, 0xbd, 0x9c, 0xcf, 0xbe, 0xdd, 0xf5, /* 0xa0-0xa7 */ 0xf9, 0xb8, 0xd1, 0xae, 0xaa, 0xf0, 0xa9, 0xee, /* 0xa8-0xaf */ 0xf8, 0xf1, 0xfd, 0xfc, 0xef, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */ 0xf7, 0xfb, 0xd0, 0xaf, 0xac, 0xab, 0xf3, 0xa8, /* 0xb8-0xbf */ 0xb7, 0xb5, 0xb6, 0xc7, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ 0xd4, 0x90, 0xd2, 0xd3, 0xde, 0xd6, 0xd7, 0xd8, /* 0xc8-0xcf */ 0x00, 0xa5, 0xe3, 0xe0, 0xe2, 0xe5, 0x99, 0xe8, /* 0xd0-0xd7 */ 0x9d, 0xeb, 0xe9, 0xea, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ 0x85, 0xa0, 0x83, 0xc6, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ 0x8a, 0x82, 0x88, 0x89, 0xec, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */ 0x00, 0xa4, 0x95, 0xa2, 0x93, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */ 0x9b, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0xed, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa6, 0xa7, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x98, 0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, 0x9f, /* 0x58-0x5f */ }; static const unsigned char page25[256] = { 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */ 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x69, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9f, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa7, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0x85, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0x88, 0x89, 0x8a, 0x00, 0xa1, 0x8c, /* 0xd0-0xd7 */ 0x8b, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xec, 0xdf, /* 0xd8-0xdf */ 0xa2, 0xe1, 0x93, 0x95, 0xe4, 0xe4, 0xe6, 0x00, /* 0xe0-0xe7 */ 0xe8, 0xa3, 0x96, 0x97, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xb7, 0x8f, 0x80, /* 0x80-0x87 */ 0xd2, 0xd3, 0xd4, 0xd8, 0xd7, 0x49, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x92, 0x92, 0xe2, 0x99, 0xe3, 0xea, 0xeb, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x9e, /* 0x98-0x9f */ 0xb5, 0xd6, 0xe0, 0xe9, 0xa5, 0xa5, 0xa6, 0xa6, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0x00, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xde, 0x00, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp857", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp857(void) { return register_nls(&table); } static void __exit exit_nls_cp857(void) { unregister_nls(&table); } module_init(init_nls_cp857) module_exit(exit_nls_cp857) MODULE_DESCRIPTION("NLS Codepage 857 (Turkish)"); MODULE_LICENSE("Dual BSD/GPL");
11 11 11 11 11 11 11 11 10 3 11 11 11 11 11 11 11 11 11 11 11 10 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 10 11 11 11 11 11 11 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 // SPDX-License-Identifier: GPL-2.0+ /* * NILFS segment buffer * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi. * */ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/crc32.h> #include <linux/backing-dev.h> #include <linux/slab.h> #include "page.h" #include "segbuf.h" struct nilfs_write_info { struct the_nilfs *nilfs; struct bio *bio; int start, end; /* The region to be submitted */ int rest_blocks; int max_pages; int nr_vecs; sector_t blocknr; }; static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, struct the_nilfs *nilfs); static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf); struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) { struct nilfs_segment_buffer *segbuf; segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS); if (unlikely(!segbuf)) return NULL; segbuf->sb_super = sb; INIT_LIST_HEAD(&segbuf->sb_list); INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); INIT_LIST_HEAD(&segbuf->sb_payload_buffers); segbuf->sb_super_root = NULL; init_completion(&segbuf->sb_bio_event); atomic_set(&segbuf->sb_err, 0); segbuf->sb_nbio = 0; return segbuf; } void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf) { kmem_cache_free(nilfs_segbuf_cachep, segbuf); } void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum, unsigned long offset, struct the_nilfs *nilfs) { segbuf->sb_segnum = segnum; nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start, &segbuf->sb_fseg_end); segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset; segbuf->sb_rest_blocks = segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; } /** * nilfs_segbuf_map_cont - map a new log behind a given log * @segbuf: new segment buffer * @prev: segment buffer containing a log to be continued */ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, struct nilfs_segment_buffer *prev) { segbuf->sb_segnum = prev->sb_segnum; segbuf->sb_fseg_start = prev->sb_fseg_start; segbuf->sb_fseg_end = prev->sb_fseg_end; segbuf->sb_pseg_start = prev->sb_pseg_start + prev->sb_sum.nblocks; segbuf->sb_rest_blocks = segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; } void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf, __u64 nextnum, struct the_nilfs *nilfs) { segbuf->sb_nextnum = nextnum; segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum); } int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) { struct buffer_head *bh; bh = sb_getblk(segbuf->sb_super, segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk); if (unlikely(!bh)) return -ENOMEM; lock_buffer(bh); if (!buffer_uptodate(bh)) { memset(bh->b_data, 0, bh->b_size); set_buffer_uptodate(bh); } unlock_buffer(bh); nilfs_segbuf_add_segsum_buffer(segbuf, bh); return 0; } int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, struct buffer_head **bhp) { struct buffer_head *bh; bh = sb_getblk(segbuf->sb_super, segbuf->sb_pseg_start + segbuf->sb_sum.nblocks); if (unlikely(!bh)) return -ENOMEM; nilfs_segbuf_add_payload_buffer(segbuf, bh); *bhp = bh; return 0; } int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned int flags, time64_t ctime, __u64 cno) { int err; segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0; err = nilfs_segbuf_extend_segsum(segbuf); if (unlikely(err)) return err; segbuf->sb_sum.flags = flags; segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; segbuf->sb_sum.ctime = ctime; segbuf->sb_sum.cno = cno; return 0; } /* * Setup segment summary */ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) { struct nilfs_segment_summary *raw_sum; struct buffer_head *bh_sum; bh_sum = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, b_assoc_buffers); raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data; raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC); raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum)); raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags); raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq); raw_sum->ss_create = cpu_to_le64(segbuf->sb_sum.ctime); raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next); raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks); raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo); raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes); raw_sum->ss_pad = 0; raw_sum->ss_cno = cpu_to_le64(segbuf->sb_sum.cno); } /* * CRC calculation routines */ static void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed) { struct buffer_head *bh; struct nilfs_segment_summary *raw_sum; unsigned long size, bytes = segbuf->sb_sum.sumbytes; u32 crc; bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, b_assoc_buffers); raw_sum = (struct nilfs_segment_summary *)bh->b_data; size = min_t(unsigned long, bytes, bh->b_size); crc = crc32_le(seed, (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum), size - (sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum))); list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { bytes -= size; size = min_t(unsigned long, bytes, bh->b_size); crc = crc32_le(crc, bh->b_data, size); } raw_sum->ss_sumsum = cpu_to_le32(crc); } static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, u32 seed) { struct buffer_head *bh; struct nilfs_segment_summary *raw_sum; u32 crc; bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, b_assoc_buffers); raw_sum = (struct nilfs_segment_summary *)bh->b_data; crc = crc32_le(seed, (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum), bh->b_size - sizeof(raw_sum->ss_datasum)); list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { crc = crc32_le(crc, bh->b_data, bh->b_size); } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { size_t offset = offset_in_folio(bh->b_folio, bh->b_data); unsigned char *from; /* Do not support block sizes larger than PAGE_SIZE */ from = kmap_local_folio(bh->b_folio, offset); crc = crc32_le(crc, from, bh->b_size); kunmap_local(from); } raw_sum->ss_datasum = cpu_to_le32(crc); } static void nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf, u32 seed) { struct nilfs_super_root *raw_sr; struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info; unsigned int srsize; u32 crc; raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data; srsize = NILFS_SR_BYTES(nilfs->ns_inode_size); crc = crc32_le(seed, (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), srsize - sizeof(raw_sr->sr_sum)); raw_sr->sr_sum = cpu_to_le32(crc); } static void nilfs_release_buffers(struct list_head *list) { struct buffer_head *bh, *n; list_for_each_entry_safe(bh, n, list, b_assoc_buffers) { list_del_init(&bh->b_assoc_buffers); brelse(bh); } } static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) { nilfs_release_buffers(&segbuf->sb_segsum_buffers); nilfs_release_buffers(&segbuf->sb_payload_buffers); segbuf->sb_super_root = NULL; } /* * Iterators for segment buffers */ void nilfs_clear_logs(struct list_head *logs) { struct nilfs_segment_buffer *segbuf; list_for_each_entry(segbuf, logs, sb_list) nilfs_segbuf_clear(segbuf); } void nilfs_truncate_logs(struct list_head *logs, struct nilfs_segment_buffer *last) { struct nilfs_segment_buffer *n, *segbuf; segbuf = list_prepare_entry(last, logs, sb_list); list_for_each_entry_safe_continue(segbuf, n, logs, sb_list) { list_del_init(&segbuf->sb_list); nilfs_segbuf_clear(segbuf); nilfs_segbuf_free(segbuf); } } int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs) { struct nilfs_segment_buffer *segbuf; int ret = 0; list_for_each_entry(segbuf, logs, sb_list) { ret = nilfs_segbuf_write(segbuf, nilfs); if (ret) break; } return ret; } int nilfs_wait_on_logs(struct list_head *logs) { struct nilfs_segment_buffer *segbuf; int err, ret = 0; list_for_each_entry(segbuf, logs, sb_list) { err = nilfs_segbuf_wait(segbuf); if (err && !ret) ret = err; } return ret; } /** * nilfs_add_checksums_on_logs - add checksums on the logs * @logs: list of segment buffers storing target logs * @seed: checksum seed value */ void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed) { struct nilfs_segment_buffer *segbuf; list_for_each_entry(segbuf, logs, sb_list) { if (segbuf->sb_super_root) nilfs_segbuf_fill_in_super_root_crc(segbuf, seed); nilfs_segbuf_fill_in_segsum_crc(segbuf, seed); nilfs_segbuf_fill_in_data_crc(segbuf, seed); } } /* * BIO operations */ static void nilfs_end_bio_write(struct bio *bio) { struct nilfs_segment_buffer *segbuf = bio->bi_private; if (bio->bi_status) atomic_inc(&segbuf->sb_err); bio_put(bio); complete(&segbuf->sb_bio_event); } static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, struct nilfs_write_info *wi) { struct bio *bio = wi->bio; bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; submit_bio(bio); segbuf->sb_nbio++; wi->bio = NULL; wi->rest_blocks -= wi->end - wi->start; wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); wi->start = wi->end; return 0; } static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, struct nilfs_write_info *wi) { wi->bio = NULL; wi->rest_blocks = segbuf->sb_sum.nblocks; wi->max_pages = BIO_MAX_VECS; wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); wi->start = wi->end = 0; wi->blocknr = segbuf->sb_pseg_start; } static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, struct nilfs_write_info *wi, struct buffer_head *bh) { int err; BUG_ON(wi->nr_vecs <= 0); repeat: if (!wi->bio) { wi->bio = bio_alloc(wi->nilfs->ns_bdev, wi->nr_vecs, REQ_OP_WRITE, GFP_NOIO); wi->bio->bi_iter.bi_sector = (wi->blocknr + wi->end) << (wi->nilfs->ns_blocksize_bits - 9); } if (bio_add_folio(wi->bio, bh->b_folio, bh->b_size, offset_in_folio(bh->b_folio, bh->b_data))) { wi->end++; return 0; } /* bio is FULL */ err = nilfs_segbuf_submit_bio(segbuf, wi); /* never submit current bh */ if (likely(!err)) goto repeat; return err; } /** * nilfs_segbuf_write - submit write requests of a log * @segbuf: buffer storing a log to be written * @nilfs: nilfs object * * Return: Always 0. */ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, struct the_nilfs *nilfs) { struct nilfs_write_info wi; struct buffer_head *bh; int res = 0; wi.nilfs = nilfs; nilfs_segbuf_prepare_write(segbuf, &wi); list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { res = nilfs_segbuf_submit_bh(segbuf, &wi, bh); if (unlikely(res)) goto failed_bio; } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { res = nilfs_segbuf_submit_bh(segbuf, &wi, bh); if (unlikely(res)) goto failed_bio; } if (wi.bio) { /* * Last BIO is always sent through the following * submission. */ wi.bio->bi_opf |= REQ_SYNC; res = nilfs_segbuf_submit_bio(segbuf, &wi); } failed_bio: return res; } /** * nilfs_segbuf_wait - wait for completion of requested BIOs * @segbuf: segment buffer * * Return: 0 on success, or %-EIO if I/O error is detected. */ static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) { int err = 0; if (!segbuf->sb_nbio) return 0; do { wait_for_completion(&segbuf->sb_bio_event); } while (--segbuf->sb_nbio > 0); if (unlikely(atomic_read(&segbuf->sb_err) > 0)) { nilfs_err(segbuf->sb_super, "I/O error writing log (start-blocknr=%llu, block-count=%lu) in segment %llu", (unsigned long long)segbuf->sb_pseg_start, segbuf->sb_sum.nblocks, (unsigned long long)segbuf->sb_segnum); err = -EIO; } return err; }
68 406 52 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * Please send any bug reports or fixes you make to the * email addresses: * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Randall Stewart <randall@sctp.chicago.il.us> * Ken Morneau <kmorneau@cisco.com> * Qiaobing Xie <qxie1@email.mot.com> * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * Hui Huang <hui.huang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Dajiang Zhang <dajiang.zhang@nokia.com> * Ardelle Fan <ardelle.fan@intel.com> * Ryan Layer <rmlayer@us.ibm.com> * Anup Pemmaiah <pemmaiah@cc.usu.edu> * Kevin Gao <kevin.gao@intel.com> */ #ifndef __sctp_structs_h__ #define __sctp_structs_h__ #include <crypto/sha2.h> #include <linux/ktime.h> #include <linux/generic-radix-tree.h> #include <linux/rhashtable-types.h> #include <linux/socket.h> /* linux/in.h needs this!! */ #include <linux/in.h> /* We get struct sockaddr_in. */ #include <linux/in6.h> /* We get struct in6_addr */ #include <linux/ipv6.h> #include <asm/param.h> /* We get MAXHOSTNAMELEN. */ #include <linux/atomic.h> /* This gets us atomic counters. */ #include <linux/skbuff.h> /* We need sk_buff_head. */ #include <linux/workqueue.h> /* We need tq_struct. */ #include <linux/sctp.h> /* We need sctp* header structs. */ #include <net/sctp/auth.h> /* We need auth specific structs */ #include <net/ip.h> /* For inet_skb_parm */ /* A convenience structure for handling sockaddr structures. * We should wean ourselves off this. */ union sctp_addr { struct sockaddr_inet sa; /* Large enough for both address families */ struct sockaddr_in v4; struct sockaddr_in6 v6; }; /* Forward declarations for data structures. */ struct sctp_globals; struct sctp_endpoint; struct sctp_association; struct sctp_transport; struct sctp_packet; struct sctp_chunk; struct sctp_inq; struct sctp_outq; struct sctp_bind_addr; struct sctp_ulpq; struct sctp_ep_common; struct sctp_stream; #include <net/sctp/tsnmap.h> #include <net/sctp/ulpevent.h> #include <net/sctp/ulpqueue.h> #include <net/sctp/stream_interleave.h> /* Structures useful for managing bind/connect. */ struct sctp_bind_bucket { unsigned short port; signed char fastreuse; signed char fastreuseport; kuid_t fastuid; struct hlist_node node; struct hlist_head owner; struct net *net; }; struct sctp_bind_hashbucket { spinlock_t lock; struct hlist_head chain; }; /* Used for hashing all associations. */ struct sctp_hashbucket { rwlock_t lock; struct hlist_head chain; } __attribute__((__aligned__(8))); /* The SCTP globals structure. */ extern struct sctp_globals { /* This is a list of groups of functions for each address * family that we support. */ struct list_head address_families; /* This is the hash of all endpoints. */ struct sctp_hashbucket *ep_hashtable; /* This is the sctp port control hash. */ struct sctp_bind_hashbucket *port_hashtable; /* This is the hash of all transports. */ struct rhltable transport_hashtable; /* Sizes of above hashtables. */ int ep_hashsize; int port_hashsize; /* Default initialization values to be applied to new associations. */ __u16 max_instreams; __u16 max_outstreams; /* Flag to indicate whether computing and verifying checksum * is disabled. */ bool checksum_disable; } sctp_globals; #define sctp_max_instreams (sctp_globals.max_instreams) #define sctp_max_outstreams (sctp_globals.max_outstreams) #define sctp_address_families (sctp_globals.address_families) #define sctp_ep_hashsize (sctp_globals.ep_hashsize) #define sctp_ep_hashtable (sctp_globals.ep_hashtable) #define sctp_port_hashsize (sctp_globals.port_hashsize) #define sctp_port_hashtable (sctp_globals.port_hashtable) #define sctp_transport_hashtable (sctp_globals.transport_hashtable) #define sctp_checksum_disable (sctp_globals.checksum_disable) /* SCTP Socket type: UDP or TCP style. */ enum sctp_socket_type { SCTP_SOCKET_UDP = 0, SCTP_SOCKET_UDP_HIGH_BANDWIDTH, SCTP_SOCKET_TCP }; /* Per socket SCTP information. */ struct sctp_sock { /* inet_sock has to be the first member of sctp_sock */ struct inet_sock inet; /* What kind of a socket is this? */ enum sctp_socket_type type; /* PF_ family specific functions. */ struct sctp_pf *pf; /* What is our base endpointer? */ struct sctp_endpoint *ep; struct sctp_bind_bucket *bind_hash; /* Various Socket Options. */ __u16 default_stream; __u32 default_ppid; __u16 default_flags; __u32 default_context; __u32 default_timetolive; __u32 default_rcv_context; int max_burst; /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to * the destination address every heartbeat interval. This value * will be inherited by all new associations. */ __u32 hbinterval; __u32 probe_interval; __be16 udp_port; __be16 encap_port; /* This is the max_retrans value for new associations. */ __u16 pathmaxrxt; __u32 flowlabel; __u8 dscp; __u16 pf_retrans; __u16 ps_retrans; /* The initial Path MTU to use for new associations. */ __u32 pathmtu; /* The default SACK delay timeout for new associations. */ __u32 sackdelay; __u32 sackfreq; /* Flags controlling Heartbeat, SACK delay, and Path MTU Discovery. */ __u32 param_flags; __u32 default_ss; struct sctp_rtoinfo rtoinfo; struct sctp_paddrparams paddrparam; struct sctp_assocparams assocparams; /* * These two structures must be grouped together for the usercopy * whitelist region. */ __u16 subscribe; struct sctp_initmsg initmsg; int user_frag; __u32 autoclose; __u32 adaptation_ind; __u32 pd_point; __u16 nodelay:1, pf_expose:2, reuse:1, disable_fragments:1, v4mapped:1, frag_interleave:1, recvrcvinfo:1, recvnxtinfo:1, data_ready_signalled:1, cookie_auth_enable:1; atomic_t pd_mode; /* Fields after this point will be skipped on copies, like on accept * and peeloff operations */ /* Receive to here while partial delivery is in effect. */ struct sk_buff_head pd_lobby; struct list_head auto_asconf_list; int do_auto_asconf; }; #define sctp_sk(ptr) container_of_const(ptr, struct sctp_sock, inet.sk) static inline struct sock *sctp_opt2sk(const struct sctp_sock *sp) { return (struct sock *)sp; } #if IS_ENABLED(CONFIG_IPV6) struct sctp6_sock { struct sctp_sock sctp; struct ipv6_pinfo inet6; }; #endif /* CONFIG_IPV6 */ /* This is our APPLICATION-SPECIFIC state cookie. * THIS IS NOT DICTATED BY THE SPECIFICATION. */ /* These are the parts of an association which we send in the cookie. * Most of these are straight out of: * RFC2960 12.2 Parameters necessary per association (i.e. the TCB) * */ struct sctp_cookie { /* My : Tag expected in every inbound packet and sent * Verification: in the INIT or INIT ACK chunk. * Tag : */ __u32 my_vtag; /* Peer's : Tag expected in every outbound packet except * Verification: in the INIT chunk. * Tag : */ __u32 peer_vtag; /* The rest of these are not from the spec, but really need to * be in the cookie. */ /* My Tie Tag : Assist in discovering a restarting association. */ __u32 my_ttag; /* Peer's Tie Tag: Assist in discovering a restarting association. */ __u32 peer_ttag; /* When does this cookie expire? */ ktime_t expiration; /* Number of inbound/outbound streams which are set * and negotiated during the INIT process. */ __u16 sinit_num_ostreams; __u16 sinit_max_instreams; /* This is the first sequence number I used. */ __u32 initial_tsn; /* This holds the originating address of the INIT packet. */ union sctp_addr peer_addr; /* IG Section 2.35.3 * Include the source port of the INIT-ACK */ __u16 my_port; __u8 prsctp_capable; /* Padding for future use */ __u8 padding; __u32 adaptation_ind; __u8 auth_random[sizeof(struct sctp_paramhdr) + SCTP_AUTH_RANDOM_LENGTH]; __u8 auth_hmacs[SCTP_AUTH_NUM_HMACS * sizeof(__u16) + 2]; __u8 auth_chunks[sizeof(struct sctp_paramhdr) + SCTP_AUTH_MAX_CHUNKS]; /* This is a shim for my peer's INIT packet, followed by * a copy of the raw address list of the association. * The length of the raw address list is saved in the * raw_addr_list_len field, which will be used at the time when * the association TCB is re-constructed from the cookie. */ __u32 raw_addr_list_len; /* struct sctp_init_chunk peer_init[]; */ }; /* The format of our cookie that we send to our peer. */ struct sctp_signed_cookie { __u8 mac[SCTP_COOKIE_MAC_SIZE]; __u32 __pad; /* force sctp_cookie alignment to 64 bits */ struct sctp_cookie c; } __packed; /* This is another convenience type to allocate memory for address * params for the maximum size and pass such structures around * internally. */ union sctp_addr_param { struct sctp_paramhdr p; struct sctp_ipv4addr_param v4; struct sctp_ipv6addr_param v6; }; /* A convenience type to allow walking through the various * parameters and avoid casting all over the place. */ union sctp_params { void *v; struct sctp_paramhdr *p; struct sctp_cookie_preserve_param *life; struct sctp_hostname_param *dns; struct sctp_cookie_param *cookie; struct sctp_supported_addrs_param *sat; struct sctp_ipv4addr_param *v4; struct sctp_ipv6addr_param *v6; union sctp_addr_param *addr; struct sctp_adaptation_ind_param *aind; struct sctp_supported_ext_param *ext; struct sctp_random_param *random; struct sctp_chunks_param *chunks; struct sctp_hmac_algo_param *hmac_algo; struct sctp_addip_param *addip; }; /* RFC 2960. Section 3.3.5 Heartbeat. * Heartbeat Information: variable length * The Sender-specific Heartbeat Info field should normally include * information about the sender's current time when this HEARTBEAT * chunk is sent and the destination transport address to which this * HEARTBEAT is sent (see Section 8.3). */ struct sctp_sender_hb_info { struct sctp_paramhdr param_hdr; union sctp_addr daddr; unsigned long sent_at; __u64 hb_nonce; __u32 probe_size; }; int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp); int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid); void sctp_stream_free(struct sctp_stream *stream); void sctp_stream_clear(struct sctp_stream *stream); void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new); /* What is the current SSN number for this stream? */ #define sctp_ssn_peek(stream, type, sid) \ (sctp_stream_##type((stream), (sid))->ssn) /* Return the next SSN number for this stream. */ #define sctp_ssn_next(stream, type, sid) \ (sctp_stream_##type((stream), (sid))->ssn++) /* Skip over this ssn and all below. */ #define sctp_ssn_skip(stream, type, sid, ssn) \ (sctp_stream_##type((stream), (sid))->ssn = ssn + 1) /* What is the current MID number for this stream? */ #define sctp_mid_peek(stream, type, sid) \ (sctp_stream_##type((stream), (sid))->mid) /* Return the next MID number for this stream. */ #define sctp_mid_next(stream, type, sid) \ (sctp_stream_##type((stream), (sid))->mid++) /* Skip over this mid and all below. */ #define sctp_mid_skip(stream, type, sid, mid) \ (sctp_stream_##type((stream), (sid))->mid = mid + 1) /* What is the current MID_uo number for this stream? */ #define sctp_mid_uo_peek(stream, type, sid) \ (sctp_stream_##type((stream), (sid))->mid_uo) /* Return the next MID_uo number for this stream. */ #define sctp_mid_uo_next(stream, type, sid) \ (sctp_stream_##type((stream), (sid))->mid_uo++) /* * Pointers to address related SCTP functions. * (i.e. things that depend on the address family.) */ struct sctp_af { int (*sctp_xmit) (struct sk_buff *skb, struct sctp_transport *); int (*setsockopt) (struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt) (struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); void (*get_dst) (struct sctp_transport *t, union sctp_addr *saddr, struct flowi *fl, struct sock *sk); void (*get_saddr) (struct sctp_sock *sk, struct sctp_transport *t, struct flowi *fl); void (*copy_addrlist) (struct list_head *, struct net_device *); int (*cmp_addr) (const union sctp_addr *addr1, const union sctp_addr *addr2); void (*addr_copy) (union sctp_addr *dst, union sctp_addr *src); void (*from_skb) (union sctp_addr *, struct sk_buff *skb, int saddr); void (*from_sk) (union sctp_addr *, struct sock *sk); bool (*from_addr_param) (union sctp_addr *, union sctp_addr_param *, __be16 port, int iif); int (*to_addr_param) (const union sctp_addr *, union sctp_addr_param *); int (*addr_valid) (union sctp_addr *, struct sctp_sock *, const struct sk_buff *); enum sctp_scope (*scope)(union sctp_addr *); void (*inaddr_any) (union sctp_addr *, __be16); int (*is_any) (const union sctp_addr *); int (*available) (union sctp_addr *, struct sctp_sock *); int (*skb_iif) (const struct sk_buff *sk); int (*skb_sdif)(const struct sk_buff *sk); int (*is_ce) (const struct sk_buff *sk); void (*seq_dump_addr)(struct seq_file *seq, union sctp_addr *addr); void (*ecn_capable)(struct sock *sk); __u16 net_header_len; int sockaddr_len; int (*ip_options_len)(struct sock *sk); sa_family_t sa_family; struct list_head list; }; struct sctp_af *sctp_get_af_specific(sa_family_t); int sctp_register_af(struct sctp_af *); /* Protocol family functions. */ struct sctp_pf { void (*event_msgname)(struct sctp_ulpevent *, char *, int *); void (*skb_msgname) (struct sk_buff *, char *, int *); int (*af_supported) (sa_family_t, struct sctp_sock *); int (*cmp_addr) (const union sctp_addr *, const union sctp_addr *, struct sctp_sock *); int (*bind_verify) (struct sctp_sock *, union sctp_addr *); int (*send_verify) (struct sctp_sock *, union sctp_addr *); int (*supported_addrs)(const struct sctp_sock *, __be16 *); struct sock *(*create_accept_sk) (struct sock *sk, struct sctp_association *asoc, bool kern); int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr *addr); void (*to_sk_saddr)(union sctp_addr *, struct sock *sk); void (*to_sk_daddr)(union sctp_addr *, struct sock *sk); void (*copy_ip_options)(struct sock *sk, struct sock *newsk); struct sctp_af *af; }; /* Structure to track chunk fragments that have been acked, but peer * fragments of the same message have not. */ struct sctp_datamsg { /* Chunks waiting to be submitted to lower layer. */ struct list_head chunks; /* Reference counting. */ refcount_t refcnt; /* When is this message no longer interesting to the peer? */ unsigned long expires_at; /* Did the message fail to send? */ int send_error; u8 send_failed:1, can_delay:1, /* should this message be Nagle delayed */ abandoned:1; /* should this message be abandoned */ }; struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *, struct sctp_sndrcvinfo *, struct iov_iter *); void sctp_datamsg_free(struct sctp_datamsg *); void sctp_datamsg_put(struct sctp_datamsg *); void sctp_chunk_fail(struct sctp_chunk *, int error); int sctp_chunk_abandoned(struct sctp_chunk *); /* RFC2960 1.4 Key Terms * * o Chunk: A unit of information within an SCTP packet, consisting of * a chunk header and chunk-specific content. * * As a matter of convenience, we remember the SCTP common header for * each chunk as well as a few other header pointers... */ struct sctp_chunk { struct list_head list; refcount_t refcnt; /* How many times this chunk have been sent, for prsctp RTX policy */ int sent_count; union { /* This is our link to the per-transport transmitted list. */ struct list_head transmitted_list; /* List in specific stream outq */ struct list_head stream_list; }; /* This field is used by chunks that hold fragmented data. * For the first fragment this is the list that holds the rest of * fragments. For the remaining fragments, this is the link to the * frag_list maintained in the first fragment. */ struct list_head frag_list; /* This points to the sk_buff containing the actual data. */ struct sk_buff *skb; union { /* In case of GSO packets, this will store the head one */ struct sk_buff *head_skb; /* In case of auth enabled, this will point to the shkey */ struct sctp_shared_key *shkey; }; /* These are the SCTP headers by reverse order in a packet. * Note that some of these may happen more than once. In that * case, we point at the "current" one, whatever that means * for that level of header. */ /* We point this at the FIRST TLV parameter to chunk_hdr. */ union sctp_params param_hdr; union { __u8 *v; struct sctp_datahdr *data_hdr; struct sctp_inithdr *init_hdr; struct sctp_sackhdr *sack_hdr; struct sctp_heartbeathdr *hb_hdr; struct sctp_sender_hb_info *hbs_hdr; struct sctp_shutdownhdr *shutdown_hdr; struct sctp_signed_cookie *cookie_hdr; struct sctp_ecnehdr *ecne_hdr; struct sctp_cwrhdr *ecn_cwr_hdr; struct sctp_errhdr *err_hdr; struct sctp_addiphdr *addip_hdr; struct sctp_fwdtsn_hdr *fwdtsn_hdr; struct sctp_authhdr *auth_hdr; struct sctp_idatahdr *idata_hdr; struct sctp_ifwdtsn_hdr *ifwdtsn_hdr; } subh; __u8 *chunk_end; struct sctp_chunkhdr *chunk_hdr; struct sctphdr *sctp_hdr; /* This needs to be recoverable for SCTP_SEND_FAILED events. */ struct sctp_sndrcvinfo sinfo; /* Which association does this belong to? */ struct sctp_association *asoc; /* What endpoint received this chunk? */ struct sctp_ep_common *rcvr; /* We fill this in if we are calculating RTT. */ unsigned long sent_at; /* What is the origin IP address for this chunk? */ union sctp_addr source; /* Destination address for this chunk. */ union sctp_addr dest; /* For outbound message, track all fragments for SEND_FAILED. */ struct sctp_datamsg *msg; /* For an inbound chunk, this tells us where it came from. * For an outbound chunk, it tells us where we'd like it to * go. It is NULL if we have no preference. */ struct sctp_transport *transport; /* SCTP-AUTH: For the special case inbound processing of COOKIE-ECHO * we need save a pointer to the AUTH chunk, since the SCTP-AUTH * spec violates the principle premis that all chunks are processed * in order. */ struct sk_buff *auth_chunk; #define SCTP_CAN_FRTX 0x0 #define SCTP_NEED_FRTX 0x1 #define SCTP_DONT_FRTX 0x2 __u16 rtt_in_progress:1, /* This chunk used for RTT calc? */ has_tsn:1, /* Does this chunk have a TSN yet? */ has_ssn:1, /* Does this chunk have a SSN yet? */ #define has_mid has_ssn singleton:1, /* Only chunk in the packet? */ end_of_packet:1, /* Last chunk in the packet? */ ecn_ce_done:1, /* Have we processed the ECN CE bit? */ pdiscard:1, /* Discard the whole packet now? */ tsn_gap_acked:1, /* Is this chunk acked by a GAP ACK? */ data_accepted:1, /* At least 1 chunk accepted */ auth:1, /* IN: was auth'ed | OUT: needs auth */ has_asconf:1, /* IN: have seen an asconf before */ pmtu_probe:1, /* Used by PLPMTUD, can be set in s HB chunk */ tsn_missing_report:2, /* Data chunk missing counter. */ fast_retransmit:2; /* Is this chunk fast retransmitted? */ }; #define sctp_chunk_retransmitted(chunk) (chunk->sent_count > 1) void sctp_chunk_hold(struct sctp_chunk *); void sctp_chunk_put(struct sctp_chunk *); int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len, struct iov_iter *from); void sctp_chunk_free(struct sctp_chunk *); void *sctp_addto_chunk(struct sctp_chunk *, int len, const void *data); struct sctp_chunk *sctp_chunkify(struct sk_buff *, const struct sctp_association *, struct sock *, gfp_t gfp); void sctp_init_addrs(struct sctp_chunk *, union sctp_addr *, union sctp_addr *); const union sctp_addr *sctp_source(const struct sctp_chunk *chunk); static inline __u16 sctp_chunk_stream_no(struct sctp_chunk *ch) { return ntohs(ch->subh.data_hdr->stream); } enum { SCTP_ADDR_NEW, /* new address added to assoc/ep */ SCTP_ADDR_SRC, /* address can be used as source */ SCTP_ADDR_DEL, /* address about to be deleted */ }; /* This is a structure for holding either an IPv6 or an IPv4 address. */ struct sctp_sockaddr_entry { struct list_head list; struct rcu_head rcu; union sctp_addr a; __u8 state; __u8 valid; }; #define SCTP_ADDRESS_TICK_DELAY 500 /* This structure holds lists of chunks as we are assembling for * transmission. */ struct sctp_packet { /* These are the SCTP header values (host order) for the packet. */ __u16 source_port; __u16 destination_port; __u32 vtag; /* This contains the payload chunks. */ struct list_head chunk_list; /* This is the overhead of the sctp and ip headers. */ size_t overhead; /* This is the total size of all chunks INCLUDING padding. */ size_t size; /* This is the maximum size this packet may have */ size_t max_size; /* The packet is destined for this transport address. * The function we finally use to pass down to the next lower * layer lives in the transport structure. */ struct sctp_transport *transport; /* pointer to the auth chunk for this packet */ struct sctp_chunk *auth; u8 has_cookie_echo:1, /* This packet contains a COOKIE-ECHO chunk. */ has_sack:1, /* This packet contains a SACK chunk. */ has_auth:1, /* This packet contains an AUTH chunk */ has_data:1, /* This packet contains at least 1 DATA chunk */ ipfragok:1; /* So let ip fragment this packet */ }; void sctp_packet_init(struct sctp_packet *, struct sctp_transport *, __u16 sport, __u16 dport); void sctp_packet_config(struct sctp_packet *, __u32 vtag, int); enum sctp_xmit sctp_packet_transmit_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk, int one_packet, gfp_t gfp); enum sctp_xmit sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk); int sctp_packet_transmit(struct sctp_packet *, gfp_t); void sctp_packet_free(struct sctp_packet *); static inline int sctp_packet_empty(struct sctp_packet *packet) { return packet->size == packet->overhead; } /* This represents a remote transport address. * For local transport addresses, we just use union sctp_addr. * * RFC2960 Section 1.4 Key Terms * * o Transport address: A Transport Address is traditionally defined * by Network Layer address, Transport Layer protocol and Transport * Layer port number. In the case of SCTP running over IP, a * transport address is defined by the combination of an IP address * and an SCTP port number (where SCTP is the Transport protocol). * * RFC2960 Section 7.1 SCTP Differences from TCP Congestion control * * o The sender keeps a separate congestion control parameter set for * each of the destination addresses it can send to (not each * source-destination pair but for each destination). The parameters * should decay if the address is not used for a long enough time * period. * */ struct sctp_transport { /* A list of transports. */ struct list_head transports; struct rhlist_head node; /* Reference counting. */ refcount_t refcnt; __u32 dead:1, /* RTO-Pending : A flag used to track if one of the DATA * chunks sent to this address is currently being * used to compute a RTT. If this flag is 0, * the next DATA chunk sent to this destination * should be used to compute a RTT and this flag * should be set. Every time the RTT * calculation completes (i.e. the DATA chunk * is SACK'd) clear this flag. */ rto_pending:1, /* * hb_sent : a flag that signals that we have a pending * heartbeat. */ hb_sent:1, /* Is the Path MTU update pending on this transport */ pmtu_pending:1, dst_pending_confirm:1, /* need to confirm neighbour */ /* Has this transport moved the ctsn since we last sacked */ sack_generation:1; u32 dst_cookie; struct flowi fl; /* This is the peer's IP address and port. */ union sctp_addr ipaddr; /* These are the functions we call to handle LLP stuff. */ struct sctp_af *af_specific; /* Which association do we belong to? */ struct sctp_association *asoc; /* RFC2960 * * 12.3 Per Transport Address Data * * For each destination transport address in the peer's * address list derived from the INIT or INIT ACK chunk, a * number of data elements needs to be maintained including: */ /* RTO : The current retransmission timeout value. */ unsigned long rto; __u32 rtt; /* This is the most recent RTT. */ /* RTTVAR : The current RTT variation. */ __u32 rttvar; /* SRTT : The current smoothed round trip time. */ __u32 srtt; /* * These are the congestion stats. */ /* cwnd : The current congestion window. */ __u32 cwnd; /* This is the actual cwnd. */ /* ssthresh : The current slow start threshold value. */ __u32 ssthresh; /* partial : The tracking method for increase of cwnd when in * bytes acked : congestion avoidance mode (see Section 6.2.2) */ __u32 partial_bytes_acked; /* Data that has been sent, but not acknowledged. */ __u32 flight_size; __u32 burst_limited; /* Holds old cwnd when max.burst is applied */ /* Destination */ struct dst_entry *dst; /* Source address. */ union sctp_addr saddr; /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to * the destination address every heartbeat interval. */ unsigned long hbinterval; unsigned long probe_interval; /* SACK delay timeout */ unsigned long sackdelay; __u32 sackfreq; atomic_t mtu_info; /* When was the last time that we heard from this transport? We use * this to pick new active and retran paths. */ ktime_t last_time_heard; /* When was the last time that we sent a chunk using this * transport? We use this to check for idle transports */ unsigned long last_time_sent; /* Last time(in jiffies) when cwnd is reduced due to the congestion * indication based on ECNE chunk. */ unsigned long last_time_ecne_reduced; __be16 encap_port; /* This is the max_retrans value for the transport and will * be initialized from the assocs value. This can be changed * using the SCTP_SET_PEER_ADDR_PARAMS socket option. */ __u16 pathmaxrxt; __u32 flowlabel; __u8 dscp; /* This is the partially failed retrans value for the transport * and will be initialized from the assocs value. This can be changed * using the SCTP_PEER_ADDR_THLDS socket option */ __u16 pf_retrans; /* Used for primary path switchover. */ __u16 ps_retrans; /* PMTU : The current known path MTU. */ __u32 pathmtu; /* Flags controlling Heartbeat, SACK delay, and Path MTU Discovery. */ __u32 param_flags; /* The number of times INIT has been sent on this transport. */ int init_sent_count; /* state : The current state of this destination, * : i.e. SCTP_ACTIVE, SCTP_INACTIVE, SCTP_UNKNOWN. */ int state; /* These are the error stats for this destination. */ /* Error count : The current error count for this destination. */ unsigned short error_count; /* Per : A timer used by each destination. * Destination : * Timer : * * [Everywhere else in the text this is called T3-rtx. -ed] */ struct timer_list T3_rtx_timer; /* Heartbeat timer is per destination. */ struct timer_list hb_timer; /* Timer to handle ICMP proto unreachable envets */ struct timer_list proto_unreach_timer; /* Timer to handler reconf chunk rtx */ struct timer_list reconf_timer; /* Timer to send a probe HB packet for PLPMTUD */ struct timer_list probe_timer; /* Since we're using per-destination retransmission timers * (see above), we're also using per-destination "transmitted" * queues. This probably ought to be a private struct * accessible only within the outqueue, but it's not, yet. */ struct list_head transmitted; /* We build bundle-able packets for this transport here. */ struct sctp_packet packet; /* This is the list of transports that have chunks to send. */ struct list_head send_ready; /* State information saved for SFR_CACC algorithm. The key * idea in SFR_CACC is to maintain state at the sender on a * per-destination basis when a changeover happens. * char changeover_active; * char cycling_changeover; * __u32 next_tsn_at_change; * char cacc_saw_newack; */ struct { /* An unsigned integer, which stores the next TSN to be * used by the sender, at the moment of changeover. */ __u32 next_tsn_at_change; /* A flag which indicates the occurrence of a changeover */ char changeover_active; /* A flag which indicates whether the change of primary is * the first switch to this destination address during an * active switch. */ char cycling_changeover; /* A temporary flag, which is used during the processing of * a SACK to estimate the causative TSN(s)'s group. */ char cacc_saw_newack; } cacc; struct { __u16 pmtu; __u16 probe_size; __u16 probe_high; __u8 probe_count; __u8 state; } pl; /* plpmtud related */ /* 64-bit random number sent with heartbeat. */ __u64 hb_nonce; struct rcu_head rcu; }; struct sctp_transport *sctp_transport_new(struct net *, const union sctp_addr *, gfp_t); void sctp_transport_set_owner(struct sctp_transport *, struct sctp_association *); void sctp_transport_route(struct sctp_transport *, union sctp_addr *, struct sctp_sock *); void sctp_transport_pmtu(struct sctp_transport *, struct sock *sk); void sctp_transport_free(struct sctp_transport *); void sctp_transport_reset_t3_rtx(struct sctp_transport *); void sctp_transport_reset_hb_timer(struct sctp_transport *); void sctp_transport_reset_reconf_timer(struct sctp_transport *transport); void sctp_transport_reset_probe_timer(struct sctp_transport *transport); void sctp_transport_reset_raise_timer(struct sctp_transport *transport); int sctp_transport_hold(struct sctp_transport *); void sctp_transport_put(struct sctp_transport *); void sctp_transport_update_rto(struct sctp_transport *, __u32); void sctp_transport_raise_cwnd(struct sctp_transport *, __u32, __u32); void sctp_transport_lower_cwnd(struct sctp_transport *t, enum sctp_lower_cwnd reason); void sctp_transport_burst_limited(struct sctp_transport *); void sctp_transport_burst_reset(struct sctp_transport *); unsigned long sctp_transport_timeout(struct sctp_transport *); void sctp_transport_reset(struct sctp_transport *t); bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu); void sctp_transport_immediate_rtx(struct sctp_transport *); void sctp_transport_dst_release(struct sctp_transport *t); void sctp_transport_dst_confirm(struct sctp_transport *t); void sctp_transport_pl_send(struct sctp_transport *t); bool sctp_transport_pl_recv(struct sctp_transport *t); /* This is the structure we use to queue packets as they come into * SCTP. We write packets to it and read chunks from it. */ struct sctp_inq { /* This is actually a queue of sctp_chunk each * containing a partially decoded packet. */ struct list_head in_chunk_list; /* This is the packet which is currently off the in queue and is * being worked on through the inbound chunk processing. */ struct sctp_chunk *in_progress; /* This is the delayed task to finish delivering inbound * messages. */ struct work_struct immediate; }; void sctp_inq_init(struct sctp_inq *); void sctp_inq_free(struct sctp_inq *); void sctp_inq_push(struct sctp_inq *, struct sctp_chunk *packet); struct sctp_chunk *sctp_inq_pop(struct sctp_inq *); struct sctp_chunkhdr *sctp_inq_peek(struct sctp_inq *); void sctp_inq_set_th_handler(struct sctp_inq *, work_func_t); /* This is the structure we use to hold outbound chunks. You push * chunks in and they automatically pop out the other end as bundled * packets (it calls (*output_handler)()). * * This structure covers sections 6.3, 6.4, 6.7, 6.8, 6.10, 7., 8.1, * and 8.2 of the v13 draft. * * It handles retransmissions. The connection to the timeout portion * of the state machine is through sctp_..._timeout() and timeout_handler. * * If you feed it SACKs, it will eat them. * * If you give it big chunks, it will fragment them. * * It assigns TSN's to data chunks. This happens at the last possible * instant before transmission. * * When free()'d, it empties itself out via output_handler(). */ struct sctp_outq { struct sctp_association *asoc; /* Data pending that has never been transmitted. */ struct list_head out_chunk_list; /* Stream scheduler being used */ struct sctp_sched_ops *sched; unsigned int out_qlen; /* Total length of queued data chunks. */ /* Error of send failed, may used in SCTP_SEND_FAILED event. */ unsigned int error; /* These are control chunks we want to send. */ struct list_head control_chunk_list; /* These are chunks that have been sacked but are above the * CTSN, or cumulative tsn ack point. */ struct list_head sacked; /* Put chunks on this list to schedule them for * retransmission. */ struct list_head retransmit; /* Put chunks on this list to save them for FWD TSN processing as * they were abandoned. */ struct list_head abandoned; /* How many unackd bytes do we have in-flight? */ __u32 outstanding_bytes; /* Are we doing fast-rtx on this queue */ char fast_rtx; /* Corked? */ char cork; }; void sctp_outq_init(struct sctp_association *, struct sctp_outq *); void sctp_outq_teardown(struct sctp_outq *); void sctp_outq_free(struct sctp_outq*); void sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t); int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *); int sctp_outq_is_empty(const struct sctp_outq *); void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, enum sctp_retransmit_reason reason); void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8); void sctp_outq_uncork(struct sctp_outq *, gfp_t gfp); void sctp_prsctp_prune(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, int msg_len); void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); /* Uncork and flush an outqueue. */ static inline void sctp_outq_cork(struct sctp_outq *q) { q->cork = 1; } /* SCTP skb control block. * sctp_input_cb is currently used on rx and sock rx queue */ struct sctp_input_cb { union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; struct sctp_chunk *chunk; struct sctp_af *af; __be16 encap_port; }; #define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) struct sctp_output_cb { struct sk_buff *last; }; #define SCTP_OUTPUT_CB(__skb) ((struct sctp_output_cb *)&((__skb)->cb[0])) static inline const struct sk_buff *sctp_gso_headskb(const struct sk_buff *skb) { const struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; return chunk->head_skb ? : skb; } /* These bind address data fields common between endpoints and associations */ struct sctp_bind_addr { /* RFC 2960 12.1 Parameters necessary for the SCTP instance * * SCTP Port: The local SCTP port number the endpoint is * bound to. */ __u16 port; /* RFC 2960 12.1 Parameters necessary for the SCTP instance * * Address List: The list of IP addresses that this instance * has bound. This information is passed to one's * peer(s) in INIT and INIT ACK chunks. */ struct list_head address_list; }; void sctp_bind_addr_init(struct sctp_bind_addr *, __u16 port); void sctp_bind_addr_free(struct sctp_bind_addr *); int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, enum sctp_scope scope, gfp_t gfp, int flags); int sctp_bind_addr_dup(struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, gfp_t gfp); int sctp_add_bind_addr(struct sctp_bind_addr *, union sctp_addr *, int new_size, __u8 addr_state, gfp_t gfp); int sctp_del_bind_addr(struct sctp_bind_addr *, union sctp_addr *); int sctp_bind_addr_match(struct sctp_bind_addr *, const union sctp_addr *, struct sctp_sock *); int sctp_bind_addr_conflict(struct sctp_bind_addr *, const union sctp_addr *, struct sctp_sock *, struct sctp_sock *); int sctp_bind_addr_state(const struct sctp_bind_addr *bp, const union sctp_addr *addr); int sctp_bind_addrs_check(struct sctp_sock *sp, struct sctp_sock *sp2, int cnt2); union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp, const union sctp_addr *addrs, int addrcnt, struct sctp_sock *opt); union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, int *addrs_len, gfp_t gfp); int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw, int len, __u16 port, gfp_t gfp); enum sctp_scope sctp_scope(const union sctp_addr *addr); int sctp_in_scope(struct net *net, const union sctp_addr *addr, const enum sctp_scope scope); int sctp_is_any(struct sock *sk, const union sctp_addr *addr); int sctp_is_ep_boundall(struct sock *sk); /* What type of endpoint? */ enum sctp_endpoint_type { SCTP_EP_TYPE_SOCKET, SCTP_EP_TYPE_ASSOCIATION, }; /* * A common base class to bridge the implementation view of a * socket (usually listening) endpoint versus an association's * local endpoint. * This common structure is useful for several purposes: * 1) Common interface for lookup routines. * a) Subfunctions work for either endpoint or association * b) Single interface to lookup allows hiding the lookup lock rather * than acquiring it externally. * 2) Common interface for the inbound chunk handling/state machine. * 3) Common object handling routines for reference counting, etc. * 4) Disentangle association lookup from endpoint lookup, where we * do not have to find our endpoint to find our association. * */ struct sctp_ep_common { /* Runtime type information. What kind of endpoint is this? */ enum sctp_endpoint_type type; /* Some fields to help us manage this object. * refcnt - Reference count access to this object. * dead - Do not attempt to use this object. */ refcount_t refcnt; bool dead; /* What socket does this endpoint belong to? */ struct sock *sk; /* Cache netns and it won't change once set */ struct net *net; /* This is where we receive inbound chunks. */ struct sctp_inq inqueue; /* This substructure includes the defining parameters of the * endpoint: * bind_addr.port is our shared port number. * bind_addr.address_list is our set of local IP addresses. */ struct sctp_bind_addr bind_addr; }; /* RFC Section 1.4 Key Terms * * o SCTP endpoint: The logical sender/receiver of SCTP packets. On a * multi-homed host, an SCTP endpoint is represented to its peers as a * combination of a set of eligible destination transport addresses to * which SCTP packets can be sent and a set of eligible source * transport addresses from which SCTP packets can be received. * All transport addresses used by an SCTP endpoint must use the * same port number, but can use multiple IP addresses. A transport * address used by an SCTP endpoint must not be used by another * SCTP endpoint. In other words, a transport address is unique * to an SCTP endpoint. * * From an implementation perspective, each socket has one of these. * A TCP-style socket will have exactly one association on one of * these. An UDP-style socket will have multiple associations hanging * off one of these. */ struct sctp_endpoint { /* Common substructure for endpoint and association. */ struct sctp_ep_common base; /* Fields to help us manage our entries in the hash tables. */ struct hlist_node node; int hashent; /* Associations: A list of current associations and mappings * to the data consumers for each association. This * may be in the form of a hash table or other * implementation dependent structure. The data * consumers may be process identification * information such as file descriptors, named pipe * pointer, or table pointers dependent on how SCTP * is implemented. */ /* This is really a list of struct sctp_association entries. */ struct list_head asocs; /* Cookie authentication key used by this endpoint */ struct hmac_sha256_key cookie_auth_key; /* sendbuf acct. policy. */ __u32 sndbuf_policy; /* rcvbuf acct. policy. */ __u32 rcvbuf_policy; /* SCTP-AUTH: hmacs for the endpoint encoded into parameter */ struct sctp_hmac_algo_param *auth_hmacs_list; /* SCTP-AUTH: chunks to authenticate encoded into parameter */ struct sctp_chunks_param *auth_chunk_list; /* SCTP-AUTH: endpoint shared keys */ struct list_head endpoint_shared_keys; __u16 active_key_id; __u8 ecn_enable:1, auth_enable:1, intl_enable:1, prsctp_enable:1, asconf_enable:1, reconf_enable:1; __u8 strreset_enable; struct rcu_head rcu; }; /* Recover the outer endpoint structure. */ static inline struct sctp_endpoint *sctp_ep(struct sctp_ep_common *base) { struct sctp_endpoint *ep; ep = container_of(base, struct sctp_endpoint, base); return ep; } /* These are function signatures for manipulating endpoints. */ struct sctp_endpoint *sctp_endpoint_new(struct sock *, gfp_t); void sctp_endpoint_free(struct sctp_endpoint *); void sctp_endpoint_put(struct sctp_endpoint *); int sctp_endpoint_hold(struct sctp_endpoint *ep); void sctp_endpoint_add_asoc(struct sctp_endpoint *, struct sctp_association *); struct sctp_association *sctp_endpoint_lookup_assoc( const struct sctp_endpoint *ep, const union sctp_addr *paddr, struct sctp_transport **); bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, const union sctp_addr *paddr); struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, struct net *net, const union sctp_addr *laddr, int dif, int sdif); bool sctp_has_association(struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif); int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, enum sctp_cid cid, struct sctp_init_chunk *peer_init, struct sctp_chunk *chunk, struct sctp_chunk **err_chunk); int sctp_process_init(struct sctp_association *, struct sctp_chunk *chunk, const union sctp_addr *peer, struct sctp_init_chunk *init, gfp_t gfp); __u32 sctp_generate_tag(const struct sctp_endpoint *); __u32 sctp_generate_tsn(const struct sctp_endpoint *); struct sctp_inithdr_host { __u32 init_tag; __u32 a_rwnd; __u16 num_outbound_streams; __u16 num_inbound_streams; __u32 initial_tsn; }; struct sctp_stream_priorities { /* List of priorities scheduled */ struct list_head prio_sched; /* List of streams scheduled */ struct list_head active; /* The next stream in line */ struct sctp_stream_out_ext *next; __u16 prio; __u16 users; }; struct sctp_stream_out_ext { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; struct list_head outq; /* chunks enqueued by this stream */ union { struct { /* Scheduled streams list */ struct list_head prio_list; struct sctp_stream_priorities *prio_head; }; /* Fields used by RR scheduler */ struct { struct list_head rr_list; }; struct { struct list_head fc_list; __u32 fc_length; __u16 fc_weight; }; }; }; struct sctp_stream_out { union { __u32 mid; __u16 ssn; }; __u32 mid_uo; struct sctp_stream_out_ext *ext; __u8 state; }; struct sctp_stream_in { union { __u32 mid; __u16 ssn; }; __u32 mid_uo; __u32 fsn; __u32 fsn_uo; char pd_mode; char pd_mode_uo; }; struct sctp_stream { GENRADIX(struct sctp_stream_out) out; GENRADIX(struct sctp_stream_in) in; __u16 outcnt; __u16 incnt; /* Current stream being sent, if any */ struct sctp_stream_out *out_curr; union { /* Fields used by priority scheduler */ struct { /* List of priorities scheduled */ struct list_head prio_list; }; /* Fields used by RR scheduler */ struct { /* List of streams scheduled */ struct list_head rr_list; /* The next stream in line */ struct sctp_stream_out_ext *rr_next; }; struct { struct list_head fc_list; }; }; struct sctp_stream_interleave *si; }; static inline struct sctp_stream_out *sctp_stream_out( struct sctp_stream *stream, __u16 sid) { return genradix_ptr(&stream->out, sid); } static inline struct sctp_stream_in *sctp_stream_in( struct sctp_stream *stream, __u16 sid) { return genradix_ptr(&stream->in, sid); } #define SCTP_SO(s, i) sctp_stream_out((s), (i)) #define SCTP_SI(s, i) sctp_stream_in((s), (i)) #define SCTP_STREAM_CLOSED 0x00 #define SCTP_STREAM_OPEN 0x01 static inline __u16 sctp_datachk_len(const struct sctp_stream *stream) { return stream->si->data_chunk_len; } static inline __u16 sctp_datahdr_len(const struct sctp_stream *stream) { return stream->si->data_chunk_len - sizeof(struct sctp_chunkhdr); } static inline __u16 sctp_ftsnchk_len(const struct sctp_stream *stream) { return stream->si->ftsn_chunk_len; } static inline __u16 sctp_ftsnhdr_len(const struct sctp_stream *stream) { return stream->si->ftsn_chunk_len - sizeof(struct sctp_chunkhdr); } /* SCTP_GET_ASSOC_STATS counters */ struct sctp_priv_assoc_stats { /* Maximum observed rto in the association during subsequent * observations. Value is set to 0 if no RTO measurement took place * The transport where the max_rto was observed is returned in * obs_rto_ipaddr */ struct sockaddr_storage obs_rto_ipaddr; __u64 max_obs_rto; /* Total In and Out SACKs received and sent */ __u64 isacks; __u64 osacks; /* Total In and Out packets received and sent */ __u64 opackets; __u64 ipackets; /* Total retransmitted chunks */ __u64 rtxchunks; /* TSN received > next expected */ __u64 outofseqtsns; /* Duplicate Chunks received */ __u64 idupchunks; /* Gap Ack Blocks received */ __u64 gapcnt; /* Unordered data chunks sent and received */ __u64 ouodchunks; __u64 iuodchunks; /* Ordered data chunks sent and received */ __u64 oodchunks; __u64 iodchunks; /* Control chunks sent and received */ __u64 octrlchunks; __u64 ictrlchunks; }; /* RFC2960 * * 12. Recommended Transmission Control Block (TCB) Parameters * * This section details a recommended set of parameters that should * be contained within the TCB for an implementation. This section is * for illustrative purposes and should not be deemed as requirements * on an implementation or as an exhaustive list of all parameters * inside an SCTP TCB. Each implementation may need its own additional * parameters for optimization. */ /* Here we have information about each individual association. */ struct sctp_association { /* A base structure common to endpoint and association. * In this context, it represents the associations's view * of the local endpoint of the association. */ struct sctp_ep_common base; /* Associations on the same socket. */ struct list_head asocs; /* association id. */ sctp_assoc_t assoc_id; /* This is our parent endpoint. */ struct sctp_endpoint *ep; /* These are those association elements needed in the cookie. */ struct sctp_cookie c; /* This is all information about our peer. */ struct { /* transport_addr_list * * Peer : A list of SCTP transport addresses that the * Transport : peer is bound to. This information is derived * Address : from the INIT or INIT ACK and is used to * List : associate an inbound packet with a given * : association. Normally this information is * : hashed or keyed for quick lookup and access * : of the TCB. * : The list is also initialized with the list * : of addresses passed with the sctp_connectx() * : call. * * It is a list of SCTP_transport's. */ struct list_head transport_addr_list; /* rwnd * * Peer Rwnd : Current calculated value of the peer's rwnd. */ __u32 rwnd; /* transport_count * * Peer : A count of the number of peer addresses * Transport : in the Peer Transport Address List. * Address : * Count : */ __u16 transport_count; /* port * The transport layer port number. */ __u16 port; /* primary_path * * Primary : This is the current primary destination * Path : transport address of the peer endpoint. It * : may also specify a source transport address * : on this endpoint. * * All of these paths live on transport_addr_list. * * At the bakeoffs, we discovered that the intent of * primaryPath is that it only changes when the ULP * asks to have it changed. We add the activePath to * designate the connection we are currently using to * transmit new data and most control chunks. */ struct sctp_transport *primary_path; /* Cache the primary path address here, when we * need a an address for msg_name. */ union sctp_addr primary_addr; /* active_path * The path that we are currently using to * transmit new data and most control chunks. */ struct sctp_transport *active_path; /* retran_path * * RFC2960 6.4 Multi-homed SCTP Endpoints * ... * Furthermore, when its peer is multi-homed, an * endpoint SHOULD try to retransmit a chunk to an * active destination transport address that is * different from the last destination address to * which the DATA chunk was sent. */ struct sctp_transport *retran_path; /* Pointer to last transport I have sent on. */ struct sctp_transport *last_sent_to; /* This is the last transport I have received DATA on. */ struct sctp_transport *last_data_from; /* * Mapping An array of bits or bytes indicating which out of * Array order TSN's have been received (relative to the * Last Rcvd TSN). If no gaps exist, i.e. no out of * order packets have been received, this array * will be set to all zero. This structure may be * in the form of a circular buffer or bit array. * * Last Rcvd : This is the last TSN received in * TSN : sequence. This value is set initially by * : taking the peer's Initial TSN, received in * : the INIT or INIT ACK chunk, and subtracting * : one from it. * * Throughout most of the specification this is called the * "Cumulative TSN ACK Point". In this case, we * ignore the advice in 12.2 in favour of the term * used in the bulk of the text. This value is hidden * in tsn_map--we get it by calling sctp_tsnmap_get_ctsn(). */ struct sctp_tsnmap tsn_map; /* This mask is used to disable sending the ASCONF chunk * with specified parameter to peer. */ __be16 addip_disabled_mask; /* These are capabilities which our peer advertised. */ __u16 ecn_capable:1, /* Can peer do ECN? */ ipv4_address:1, /* Peer understands IPv4 addresses? */ ipv6_address:1, /* Peer understands IPv6 addresses? */ asconf_capable:1, /* Does peer support ADDIP? */ prsctp_capable:1, /* Can peer do PR-SCTP? */ reconf_capable:1, /* Can peer do RE-CONFIG? */ intl_capable:1, /* Can peer do INTERLEAVE */ auth_capable:1, /* Is peer doing SCTP-AUTH? */ /* sack_needed: * This flag indicates if the next received * packet is to be responded to with a * SACK. This is initialized to 0. When a packet * is received sack_cnt is incremented. If this value * reaches 2 or more, a SACK is sent and the * value is reset to 0. Note: This is used only * when no DATA chunks are received out of * order. When DATA chunks are out of order, * SACK's are not delayed (see Section 6). */ sack_needed:1, /* Do we need to sack the peer? */ sack_generation:1, zero_window_announced:1; __u32 sack_cnt; __u32 adaptation_ind; /* Adaptation Code point. */ struct sctp_inithdr_host i; void *cookie; int cookie_len; /* ADDIP Section 4.2 Upon reception of an ASCONF Chunk. * C1) ... "Peer-Serial-Number'. This value MUST be initialized to the * Initial TSN Value minus 1 */ __u32 addip_serial; /* SCTP-AUTH: We need to know pears random number, hmac list * and authenticated chunk list. All that is part of the * cookie and these are just pointers to those locations */ struct sctp_random_param *peer_random; struct sctp_chunks_param *peer_chunks; struct sctp_hmac_algo_param *peer_hmacs; } peer; /* State : A state variable indicating what state the * : association is in, i.e. COOKIE-WAIT, * : COOKIE-ECHOED, ESTABLISHED, SHUTDOWN-PENDING, * : SHUTDOWN-SENT, SHUTDOWN-RECEIVED, SHUTDOWN-ACK-SENT. * * Note: No "CLOSED" state is illustrated since if a * association is "CLOSED" its TCB SHOULD be removed. * * In this implementation we DO have a CLOSED * state which is used during initiation and shutdown. * * State takes values from SCTP_STATE_*. */ enum sctp_state state; /* Overall : The overall association error count. * Error Count : [Clear this any time I get something.] */ int overall_error_count; /* The cookie life I award for any cookie. */ ktime_t cookie_life; /* These are the association's initial, max, and min RTO values. * These values will be initialized by system defaults, but can * be modified via the SCTP_RTOINFO socket option. */ unsigned long rto_initial; unsigned long rto_max; unsigned long rto_min; /* Maximum number of new data packets that can be sent in a burst. */ int max_burst; /* This is the max_retrans value for the association. This value will * be initialized from system defaults, but can be * modified by the SCTP_ASSOCINFO socket option. */ int max_retrans; /* This is the partially failed retrans value for the transport * and will be initialized from the assocs value. This can be * changed using the SCTP_PEER_ADDR_THLDS socket option */ __u16 pf_retrans; /* Used for primary path switchover. */ __u16 ps_retrans; /* Maximum number of times the endpoint will retransmit INIT */ __u16 max_init_attempts; /* How many times have we resent an INIT? */ __u16 init_retries; /* The largest timeout or RTO value to use in attempting an INIT */ unsigned long max_init_timeo; /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to * the destination address every heartbeat interval. This value * will be inherited by all new transports. */ unsigned long hbinterval; unsigned long probe_interval; __be16 encap_port; /* This is the max_retrans value for new transports in the * association. */ __u16 pathmaxrxt; __u32 flowlabel; __u8 dscp; /* Flag that path mtu update is pending */ __u8 pmtu_pending; /* Association : The smallest PMTU discovered for all of the * PMTU : peer's transport addresses. */ __u32 pathmtu; /* Flags controlling Heartbeat, SACK delay, and Path MTU Discovery. */ __u32 param_flags; __u32 sackfreq; /* SACK delay timeout */ unsigned long sackdelay; unsigned long timeouts[SCTP_NUM_TIMEOUT_TYPES]; struct timer_list timers[SCTP_NUM_TIMEOUT_TYPES]; /* Transport to which SHUTDOWN chunk was last sent. */ struct sctp_transport *shutdown_last_sent_to; /* Transport to which INIT chunk was last sent. */ struct sctp_transport *init_last_sent_to; /* How many times have we resent a SHUTDOWN */ int shutdown_retries; /* Next TSN : The next TSN number to be assigned to a new * : DATA chunk. This is sent in the INIT or INIT * : ACK chunk to the peer and incremented each * : time a DATA chunk is assigned a TSN * : (normally just prior to transmit or during * : fragmentation). */ __u32 next_tsn; /* * Last Rcvd : This is the last TSN received in sequence. This value * TSN : is set initially by taking the peer's Initial TSN, * : received in the INIT or INIT ACK chunk, and * : subtracting one from it. * * Most of RFC 2960 refers to this as the Cumulative TSN Ack Point. */ __u32 ctsn_ack_point; /* PR-SCTP Advanced.Peer.Ack.Point */ __u32 adv_peer_ack_point; /* Highest TSN that is acknowledged by incoming SACKs. */ __u32 highest_sacked; /* TSN marking the fast recovery exit point */ __u32 fast_recovery_exit; /* Flag to track the current fast recovery state */ __u8 fast_recovery; /* The number of unacknowledged data chunks. Reported through * the SCTP_STATUS sockopt. */ __u16 unack_data; /* The total number of data chunks that we've had to retransmit * as the result of a T3 timer expiration */ __u32 rtx_data_chunks; /* This is the association's receive buffer space. This value is used * to set a_rwnd field in an INIT or a SACK chunk. */ __u32 rwnd; /* This is the last advertised value of rwnd over a SACK chunk. */ __u32 a_rwnd; /* Number of bytes by which the rwnd has slopped. The rwnd is allowed * to slop over a maximum of the association's frag_point. */ __u32 rwnd_over; /* Keeps treack of rwnd pressure. This happens when we have * a window, but not receive buffer (i.e small packets). This one * is releases slowly (1 PMTU at a time ). */ __u32 rwnd_press; /* This is the sndbuf size in use for the association. * This corresponds to the sndbuf size for the association, * as specified in the sk->sndbuf. */ int sndbuf_used; /* This is the amount of memory that this association has allocated * in the receive path at any given time. */ atomic_t rmem_alloc; /* This is the wait queue head for send requests waiting on * the association sndbuf space. */ wait_queue_head_t wait; /* The message size at which SCTP fragmentation will occur. */ __u32 frag_point; __u32 user_frag; /* Counter used to count INIT errors. */ int init_err_counter; /* Count the number of INIT cycles (for doubling timeout). */ int init_cycle; /* Default send parameters. */ __u16 default_stream; __u16 default_flags; __u32 default_ppid; __u32 default_context; __u32 default_timetolive; /* Default receive parameters */ __u32 default_rcv_context; /* Stream arrays */ struct sctp_stream stream; /* All outbound chunks go through this structure. */ struct sctp_outq outqueue; /* A smart pipe that will handle reordering and fragmentation, * as well as handle passing events up to the ULP. */ struct sctp_ulpq ulpq; /* Last TSN that caused an ECNE Chunk to be sent. */ __u32 last_ecne_tsn; /* Last TSN that caused a CWR Chunk to be sent. */ __u32 last_cwr_tsn; /* How many duplicated TSNs have we seen? */ int numduptsns; /* These are to support * "SCTP Extensions for Dynamic Reconfiguration of IP Addresses * and Enforcement of Flow and Message Limits" * <draft-ietf-tsvwg-addip-sctp-02.txt> * or "ADDIP" for short. */ /* ADDIP Section 4.1.1 Congestion Control of ASCONF Chunks * * R1) One and only one ASCONF Chunk MAY be in transit and * unacknowledged at any one time. If a sender, after sending * an ASCONF chunk, decides it needs to transfer another * ASCONF Chunk, it MUST wait until the ASCONF-ACK Chunk * returns from the previous ASCONF Chunk before sending a * subsequent ASCONF. Note this restriction binds each side, * so at any time two ASCONF may be in-transit on any given * association (one sent from each endpoint). * * [This is our one-and-only-one ASCONF in flight. If we do * not have an ASCONF in flight, this is NULL.] */ struct sctp_chunk *addip_last_asconf; /* ADDIP Section 5.2 Upon reception of an ASCONF Chunk. * * This is needed to implement items E1 - E4 of the updated * spec. Here is the justification: * * Since the peer may bundle multiple ASCONF chunks toward us, * we now need the ability to cache multiple ACKs. The section * describes in detail how they are cached and cleaned up. */ struct list_head asconf_ack_list; /* These ASCONF chunks are waiting to be sent. * * These chunks can't be pushed to outqueue until receiving * ASCONF_ACK for the previous ASCONF indicated by * addip_last_asconf, so as to guarantee that only one ASCONF * is in flight at any time. * * ADDIP Section 4.1.1 Congestion Control of ASCONF Chunks * * In defining the ASCONF Chunk transfer procedures, it is * essential that these transfers MUST NOT cause congestion * within the network. To achieve this, we place these * restrictions on the transfer of ASCONF Chunks: * * R1) One and only one ASCONF Chunk MAY be in transit and * unacknowledged at any one time. If a sender, after sending * an ASCONF chunk, decides it needs to transfer another * ASCONF Chunk, it MUST wait until the ASCONF-ACK Chunk * returns from the previous ASCONF Chunk before sending a * subsequent ASCONF. Note this restriction binds each side, * so at any time two ASCONF may be in-transit on any given * association (one sent from each endpoint). * * * [I really think this is EXACTLY the sort of intelligence * which already resides in sctp_outq. Please move this * queue and its supporting logic down there. --piggy] */ struct list_head addip_chunk_list; /* ADDIP Section 4.1 ASCONF Chunk Procedures * * A2) A serial number should be assigned to the Chunk. The * serial number SHOULD be a monotonically increasing * number. The serial number SHOULD be initialized at * the start of the association to the same value as the * Initial TSN and every time a new ASCONF chunk is created * it is incremented by one after assigning the serial number * to the newly created chunk. * * ADDIP * 3.1.1 Address/Stream Configuration Change Chunk (ASCONF) * * Serial Number : 32 bits (unsigned integer) * * This value represents a Serial Number for the ASCONF * Chunk. The valid range of Serial Number is from 0 to * 4294967295 (2^32 - 1). Serial Numbers wrap back to 0 * after reaching 4294967295. */ __u32 addip_serial; int src_out_of_asoc_ok; union sctp_addr *asconf_addr_del_pending; struct sctp_transport *new_transport; /* SCTP AUTH: list of the endpoint shared keys. These * keys are provided out of band by the user application * and can't change during the lifetime of the association */ struct list_head endpoint_shared_keys; /* SCTP AUTH: * The current generated association shared key (secret) */ struct sctp_auth_bytes *asoc_shared_key; struct sctp_shared_key *shkey; /* SCTP AUTH: hmac id of the first peer requested algorithm * that we support. */ __u16 default_hmac_id; __u16 active_key_id; __u8 need_ecne:1, /* Need to send an ECNE Chunk? */ temp:1, /* Is it a temporary association? */ pf_expose:2, /* Expose pf state? */ force_delay:1; __u8 strreset_enable; __u8 strreset_outstanding; /* request param count on the fly */ __u32 strreset_outseq; /* Update after receiving response */ __u32 strreset_inseq; /* Update after receiving request */ __u32 strreset_result[2]; /* save the results of last 2 responses */ struct sctp_chunk *strreset_chunk; /* save request chunk */ struct sctp_priv_assoc_stats stats; int sent_cnt_removable; __u16 subscribe; __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; /* Security identifiers from incoming (INIT). These are set by * security_sctp_assoc_request(). These will only be used by * SCTP TCP type sockets and peeled off connections as they * cause a new socket to be generated. security_sctp_sk_clone() * will then plug these into the new socket. */ u32 secid; u32 peer_secid; struct rcu_head rcu; }; /* An eyecatcher for determining if we are really looking at an * association data structure. */ enum { SCTP_ASSOC_EYECATCHER = 0xa550c123, }; /* Recover the outer association structure. */ static inline struct sctp_association *sctp_assoc(struct sctp_ep_common *base) { struct sctp_association *asoc; asoc = container_of(base, struct sctp_association, base); return asoc; } /* These are function signatures for manipulating associations. */ struct sctp_association * sctp_association_new(const struct sctp_endpoint *ep, const struct sock *sk, enum sctp_scope scope, gfp_t gfp); void sctp_association_free(struct sctp_association *); void sctp_association_put(struct sctp_association *); void sctp_association_hold(struct sctp_association *); struct sctp_transport *sctp_assoc_choose_alter_transport( struct sctp_association *, struct sctp_transport *); void sctp_assoc_update_retran_path(struct sctp_association *); struct sctp_transport *sctp_assoc_lookup_paddr(const struct sctp_association *, const union sctp_addr *); int sctp_assoc_lookup_laddr(struct sctp_association *asoc, const union sctp_addr *laddr); struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *, const union sctp_addr *address, const gfp_t gfp, const int peer_state); void sctp_assoc_rm_peer(struct sctp_association *asoc, struct sctp_transport *peer); void sctp_assoc_control_transport(struct sctp_association *asoc, struct sctp_transport *transport, enum sctp_transport_cmd command, sctp_sn_error_t error); struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *, __u32); void sctp_assoc_migrate(struct sctp_association *, struct sock *); int sctp_assoc_update(struct sctp_association *old, struct sctp_association *new); __u32 sctp_association_get_next_tsn(struct sctp_association *); void sctp_assoc_update_frag_point(struct sctp_association *asoc); void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu); void sctp_assoc_sync_pmtu(struct sctp_association *asoc); void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int); void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int); void sctp_assoc_set_primary(struct sctp_association *, struct sctp_transport *); void sctp_assoc_del_nonprimary_peers(struct sctp_association *, struct sctp_transport *); int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, enum sctp_scope scope, gfp_t gfp); int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *, struct sctp_cookie*, gfp_t gfp); int sctp_assoc_set_id(struct sctp_association *, gfp_t); void sctp_assoc_clean_asconf_ack_cache(const struct sctp_association *asoc); struct sctp_chunk *sctp_assoc_lookup_asconf_ack( const struct sctp_association *asoc, __be32 serial); void sctp_asconf_queue_teardown(struct sctp_association *asoc); int sctp_cmp_addr_exact(const union sctp_addr *ss1, const union sctp_addr *ss2); struct sctp_chunk *sctp_get_ecne_prepend(struct sctp_association *asoc); /* A convenience structure to parse out SCTP specific CMSGs. */ struct sctp_cmsgs { struct sctp_initmsg *init; struct sctp_sndrcvinfo *srinfo; struct sctp_sndinfo *sinfo; struct sctp_prinfo *prinfo; struct sctp_authinfo *authinfo; struct msghdr *addrs_msg; }; /* Structure for tracking memory objects */ struct sctp_dbg_objcnt_entry { char *label; atomic_t *counter; }; #endif /* __sctp_structs_h__ */
4711 1184 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org> * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2008-2012 Novell Inc. * Copyright (c) 2012-2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2012-2019 Linux Foundation * * Core driver model functions and structures that should not be * shared outside of the drivers/base/ directory. * */ #include <linux/notifier.h> /** * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure. * * @subsys - the struct kset that defines this subsystem * @devices_kset - the subsystem's 'devices' directory * @interfaces - list of subsystem interfaces associated * @mutex - protect the devices, and interfaces lists. * * @drivers_kset - the list of drivers associated * @klist_devices - the klist to iterate over the @devices_kset * @klist_drivers - the klist to iterate over the @drivers_kset * @bus_notifier - the bus notifier list for anything that cares about things * on this bus. * @bus - pointer back to the struct bus_type that this structure is associated * with. * @dev_root: Default device to use as the parent. * * @glue_dirs - "glue" directory to put in-between the parent device to * avoid namespace conflicts * @class - pointer back to the struct class that this structure is associated * with. * @lock_key: Lock class key for use by the lock validator * * This structure is the one that is the actual kobject allowing struct * bus_type/class to be statically allocated safely. Nothing outside of the * driver core should ever touch these fields. */ struct subsys_private { struct kset subsys; struct kset *devices_kset; struct list_head interfaces; struct mutex mutex; struct kset *drivers_kset; struct klist klist_devices; struct klist klist_drivers; struct blocking_notifier_head bus_notifier; unsigned int drivers_autoprobe:1; const struct bus_type *bus; struct device *dev_root; struct kset glue_dirs; const struct class *class; struct lock_class_key lock_key; }; #define to_subsys_private(obj) container_of_const(obj, struct subsys_private, subsys.kobj) static inline struct subsys_private *subsys_get(struct subsys_private *sp) { if (sp) kset_get(&sp->subsys); return sp; } static inline void subsys_put(struct subsys_private *sp) { if (sp) kset_put(&sp->subsys); } struct subsys_private *bus_to_subsys(const struct bus_type *bus); struct subsys_private *class_to_subsys(const struct class *class); struct driver_private { struct kobject kobj; struct klist klist_devices; struct klist_node knode_bus; struct module_kobject *mkobj; struct device_driver *driver; }; #define to_driver(obj) container_of(obj, struct driver_private, kobj) /** * struct device_private - structure to hold the private to the driver core portions of the device structure. * * @klist_children - klist containing all children of this device * @knode_parent - node in sibling list * @knode_driver - node in driver list * @knode_bus - node in bus list * @knode_class - node in class list * @deferred_probe - entry in deferred_probe_list which is used to retry the * binding of drivers which were unable to get all the resources needed by * the device; typically because it depends on another driver getting * probed first. * @async_driver - pointer to device driver awaiting probe via async_probe * @device - pointer back to the struct device that this structure is * associated with. * @dead - This device is currently either in the process of or has been * removed from the system. Any asynchronous events scheduled for this * device should exit without taking any action. * * Nothing outside of the driver core should ever touch these fields. */ struct device_private { struct klist klist_children; struct klist_node knode_parent; struct klist_node knode_driver; struct klist_node knode_bus; struct klist_node knode_class; struct list_head deferred_probe; const struct device_driver *async_driver; char *deferred_probe_reason; struct device *device; u8 dead:1; }; #define to_device_private_parent(obj) \ container_of(obj, struct device_private, knode_parent) #define to_device_private_driver(obj) \ container_of(obj, struct device_private, knode_driver) #define to_device_private_bus(obj) \ container_of(obj, struct device_private, knode_bus) #define to_device_private_class(obj) \ container_of(obj, struct device_private, knode_class) /* initialisation functions */ int devices_init(void); int buses_init(void); int classes_init(void); int firmware_init(void); #ifdef CONFIG_SYS_HYPERVISOR int hypervisor_init(void); #else static inline int hypervisor_init(void) { return 0; } #endif int platform_bus_init(void); int faux_bus_init(void); void cpu_dev_init(void); void container_dev_init(void); #ifdef CONFIG_AUXILIARY_BUS void auxiliary_bus_init(void); #else static inline void auxiliary_bus_init(void) { } #endif struct kobject *virtual_device_parent(void); int bus_add_device(struct device *dev); void bus_probe_device(struct device *dev); void bus_remove_device(struct device *dev); void bus_notify(struct device *dev, enum bus_notifier_event value); bool bus_is_registered(const struct bus_type *bus); int bus_add_driver(struct device_driver *drv); void bus_remove_driver(struct device_driver *drv); void device_release_driver_internal(struct device *dev, const struct device_driver *drv, struct device *parent); void driver_detach(const struct device_driver *drv); void driver_deferred_probe_del(struct device *dev); void device_set_deferred_probe_reason(const struct device *dev, struct va_format *vaf); static inline int driver_match_device(const struct device_driver *drv, struct device *dev) { return drv->bus->match ? drv->bus->match(dev, drv) : 1; } static inline void dev_sync_state(struct device *dev) { if (dev->bus->sync_state) dev->bus->sync_state(dev); else if (dev->driver && dev->driver->sync_state) dev->driver->sync_state(dev); } int driver_add_groups(const struct device_driver *drv, const struct attribute_group **groups); void driver_remove_groups(const struct device_driver *drv, const struct attribute_group **groups); void device_driver_detach(struct device *dev); static inline void device_set_driver(struct device *dev, const struct device_driver *drv) { /* * Majority (all?) read accesses to dev->driver happens either * while holding device lock or in bus/driver code that is only * invoked when the device is bound to a driver and there is no * concern of the pointer being changed while it is being read. * However when reading device's uevent file we read driver pointer * without taking device lock (so we do not block there for * arbitrary amount of time). We use WRITE_ONCE() here to prevent * tearing so that READ_ONCE() can safely be used in uevent code. */ // FIXME - this cast should not be needed "soon" WRITE_ONCE(dev->driver, (struct device_driver *)drv); } int devres_release_all(struct device *dev); void device_block_probing(void); void device_unblock_probing(void); void deferred_probe_extend_timeout(void); void driver_deferred_probe_trigger(void); const char *device_get_devnode(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid, const char **tmp); /* /sys/devices directory */ extern struct kset *devices_kset; void devices_kset_move_last(struct device *dev); #if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS) int module_add_driver(struct module *mod, const struct device_driver *drv); void module_remove_driver(const struct device_driver *drv); #else static inline int module_add_driver(struct module *mod, struct device_driver *drv) { return 0; } static inline void module_remove_driver(struct device_driver *drv) { } #endif #ifdef CONFIG_DEVTMPFS int devtmpfs_init(void); #else static inline int devtmpfs_init(void) { return 0; } #endif #ifdef CONFIG_BLOCK extern const struct class block_class; static inline bool is_blockdev(struct device *dev) { return dev->class == &block_class; } #else static inline bool is_blockdev(struct device *dev) { return false; } #endif /* Device links support */ int device_links_read_lock(void); void device_links_read_unlock(int idx); int device_links_read_lock_held(void); int device_links_check_suppliers(struct device *dev); void device_links_force_bind(struct device *dev); void device_links_driver_bound(struct device *dev); void device_links_driver_cleanup(struct device *dev); void device_links_no_driver(struct device *dev); bool device_links_busy(struct device *dev); void device_links_unbind_consumers(struct device *dev); bool device_link_flag_is_sync_state_only(u32 flags); void fw_devlink_drivers_done(void); void fw_devlink_probing_done(void); #define dev_for_each_link_to_supplier(__link, __dev) \ list_for_each_entry_srcu(__link, &(__dev)->links.suppliers, c_node, \ device_links_read_lock_held()) #define dev_for_each_link_to_consumer(__link, __dev) \ list_for_each_entry_srcu(__link, &(__dev)->links.consumers, s_node, \ device_links_read_lock_held()) /* device pm support */ void device_pm_move_to_tail(struct device *dev); #ifdef CONFIG_DEVTMPFS int devtmpfs_create_node(struct device *dev); int devtmpfs_delete_node(struct device *dev); #else static inline int devtmpfs_create_node(struct device *dev) { return 0; } static inline int devtmpfs_delete_node(struct device *dev) { return 0; } #endif void software_node_notify(struct device *dev); void software_node_notify_remove(struct device *dev);
13 13 13 13 13 5 5 5 13 13 13 13 13 13 13 13 13 12 13 12 13 12 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 81 13 69 69 69 69 82 82 82 82 13 69 6 81 81 75 75 35 36 9 1 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 // SPDX-License-Identifier: MIT /* * Copyright 2018 Noralf Trønnes * Copyright (c) 2006-2009 Red Hat Inc. * Copyright (c) 2006-2008 Intel Corporation * Jesse Barnes <jesse.barnes@intel.com> * Copyright (c) 2007 Dave Airlie <airlied@linux.ie> */ #include "drm/drm_modeset_lock.h" #include <linux/export.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/string_helpers.h> #include <drm/drm_atomic.h> #include <drm/drm_client.h> #include <drm/drm_connector.h> #include <drm/drm_crtc.h> #include <drm/drm_device.h> #include <drm/drm_drv.h> #include <drm/drm_edid.h> #include <drm/drm_encoder.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" #include "drm_internal.h" #define DRM_CLIENT_MAX_CLONED_CONNECTORS 8 struct drm_client_offset { int x, y; }; int drm_client_modeset_create(struct drm_client_dev *client) { struct drm_device *dev = client->dev; unsigned int num_crtc = dev->mode_config.num_crtc; unsigned int max_connector_count = 1; struct drm_mode_set *modeset; struct drm_crtc *crtc; int i = 0; /* Add terminating zero entry to enable index less iteration */ client->modesets = kcalloc(num_crtc + 1, sizeof(*client->modesets), GFP_KERNEL); if (!client->modesets) return -ENOMEM; mutex_init(&client->modeset_mutex); drm_for_each_crtc(crtc, dev) client->modesets[i++].crtc = crtc; /* Cloning is only supported in the single crtc case. */ if (num_crtc == 1) max_connector_count = DRM_CLIENT_MAX_CLONED_CONNECTORS; for (modeset = client->modesets; modeset->crtc; modeset++) { modeset->connectors = kcalloc(max_connector_count, sizeof(*modeset->connectors), GFP_KERNEL); if (!modeset->connectors) goto err_free; } return 0; err_free: drm_client_modeset_free(client); return -ENOMEM; } static void drm_client_modeset_release(struct drm_client_dev *client) { struct drm_mode_set *modeset; drm_client_for_each_modeset(modeset, client) { int i; drm_mode_destroy(client->dev, modeset->mode); modeset->mode = NULL; modeset->fb = NULL; for (i = 0; i < modeset->num_connectors; i++) { drm_connector_put(modeset->connectors[i]); modeset->connectors[i] = NULL; } modeset->num_connectors = 0; } } void drm_client_modeset_free(struct drm_client_dev *client) { struct drm_mode_set *modeset; mutex_lock(&client->modeset_mutex); drm_client_modeset_release(client); drm_client_for_each_modeset(modeset, client) kfree(modeset->connectors); mutex_unlock(&client->modeset_mutex); mutex_destroy(&client->modeset_mutex); kfree(client->modesets); } static struct drm_mode_set * drm_client_find_modeset(struct drm_client_dev *client, struct drm_crtc *crtc) { struct drm_mode_set *modeset; drm_client_for_each_modeset(modeset, client) if (modeset->crtc == crtc) return modeset; return NULL; } static const struct drm_display_mode * drm_connector_get_tiled_mode(struct drm_connector *connector) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay == connector->tile_h_size && mode->vdisplay == connector->tile_v_size) return mode; } return NULL; } static const struct drm_display_mode * drm_connector_fallback_non_tiled_mode(struct drm_connector *connector) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay == connector->tile_h_size && mode->vdisplay == connector->tile_v_size) continue; return mode; } return NULL; } static const struct drm_display_mode * drm_connector_preferred_mode(struct drm_connector *connector, int width, int height) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay > width || mode->vdisplay > height) continue; if (mode->type & DRM_MODE_TYPE_PREFERRED) return mode; } return NULL; } static const struct drm_display_mode * drm_connector_first_mode(struct drm_connector *connector) { return list_first_entry_or_null(&connector->modes, struct drm_display_mode, head); } static const struct drm_display_mode * drm_connector_pick_cmdline_mode(struct drm_connector *connector) { const struct drm_cmdline_mode *cmdline_mode; const struct drm_display_mode *mode; bool prefer_non_interlace; /* * Find a user-defined mode. If the user gave us a valid * mode on the kernel command line, it will show up in this * list. */ list_for_each_entry(mode, &connector->modes, head) { if (mode->type & DRM_MODE_TYPE_USERDEF) return mode; } cmdline_mode = &connector->cmdline_mode; if (cmdline_mode->specified == false) return NULL; /* * Attempt to find a matching mode in the list of modes we * have gotten so far. */ prefer_non_interlace = !cmdline_mode->interlace; again: list_for_each_entry(mode, &connector->modes, head) { /* check width/height */ if (mode->hdisplay != cmdline_mode->xres || mode->vdisplay != cmdline_mode->yres) continue; if (cmdline_mode->refresh_specified) { if (drm_mode_vrefresh(mode) != cmdline_mode->refresh) continue; } if (cmdline_mode->interlace) { if (!(mode->flags & DRM_MODE_FLAG_INTERLACE)) continue; } else if (prefer_non_interlace) { if (mode->flags & DRM_MODE_FLAG_INTERLACE) continue; } return mode; } if (prefer_non_interlace) { prefer_non_interlace = false; goto again; } return NULL; } static bool drm_connector_enabled(struct drm_connector *connector, bool strict) { bool enable; if (connector->display_info.non_desktop) return false; if (strict) enable = connector->status == connector_status_connected; else enable = connector->status != connector_status_disconnected; return enable; } static void drm_client_connectors_enabled(struct drm_connector *connectors[], unsigned int connector_count, bool enabled[]) { bool any_enabled = false; struct drm_connector *connector; int i = 0; for (i = 0; i < connector_count; i++) { connector = connectors[i]; enabled[i] = drm_connector_enabled(connector, true); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] enabled? %s\n", connector->base.id, connector->name, connector->display_info.non_desktop ? "non desktop" : str_yes_no(enabled[i])); any_enabled |= enabled[i]; } if (any_enabled) return; for (i = 0; i < connector_count; i++) enabled[i] = drm_connector_enabled(connectors[i], false); } static void mode_replace(struct drm_device *dev, const struct drm_display_mode **dst, const struct drm_display_mode *src) { drm_mode_destroy(dev, (struct drm_display_mode *)*dst); *dst = src ? drm_mode_duplicate(dev, src) : NULL; } static void modes_destroy(struct drm_device *dev, const struct drm_display_mode *modes[], int count) { int i; for (i = 0; i < count; i++) mode_replace(dev, &modes[i], NULL); } static bool drm_client_target_cloned(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { int count, i; bool can_clone = false; struct drm_display_mode *dmt_mode; /* only contemplate cloning in the single crtc case */ if (dev->mode_config.num_crtc > 1) return false; count = 0; for (i = 0; i < connector_count; i++) { if (enabled[i]) count++; } /* only contemplate cloning if more than one connector is enabled */ if (count <= 1) return false; /* check the command line or if nothing common pick 1024x768 */ can_clone = true; for (i = 0; i < connector_count; i++) { int j; if (!enabled[i]) continue; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connectors[i])); if (!modes[i]) { can_clone = false; break; } for (j = 0; j < i; j++) { if (!enabled[j]) continue; if (!drm_mode_match(modes[j], modes[i], DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_CLOCK | DRM_MODE_MATCH_FLAGS | DRM_MODE_MATCH_3D_FLAGS)) can_clone = false; } } if (can_clone) { drm_dbg_kms(dev, "can clone using command line\n"); return true; } /* try and find a 1024x768 mode on each connector */ can_clone = true; dmt_mode = drm_mode_find_dmt(dev, 1024, 768, 60, false); if (!dmt_mode) goto fail; for (i = 0; i < connector_count; i++) { const struct drm_display_mode *mode; if (!enabled[i]) continue; list_for_each_entry(mode, &connectors[i]->modes, head) { if (drm_mode_match(mode, dmt_mode, DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_CLOCK | DRM_MODE_MATCH_FLAGS | DRM_MODE_MATCH_3D_FLAGS)) mode_replace(dev, &modes[i], mode); } if (!modes[i]) can_clone = false; } drm_mode_destroy(dev, dmt_mode); if (can_clone) { drm_dbg_kms(dev, "can clone using 1024x768\n"); return true; } fail: drm_info(dev, "kms: can't enable cloning when we probably wanted to.\n"); return false; } static int drm_client_get_tile_offsets(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], int idx, int h_idx, int v_idx) { int i; int hoffset = 0, voffset = 0; for (i = 0; i < connector_count; i++) { struct drm_connector *connector = connectors[i]; if (!connector->has_tile) continue; if (!modes[i] && (h_idx || v_idx)) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] no modes for connector tiled %d\n", connector->base.id, connector->name, i); continue; } if (connector->tile_h_loc < h_idx) hoffset += modes[i]->hdisplay; if (connector->tile_v_loc < v_idx) voffset += modes[i]->vdisplay; } offsets[idx].x = hoffset; offsets[idx].y = voffset; drm_dbg_kms(dev, "returned %d %d for %d %d\n", hoffset, voffset, h_idx, v_idx); return 0; } static bool drm_client_target_preferred(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { const u64 mask = BIT_ULL(connector_count) - 1; u64 conn_configured = 0; int tile_pass = 0; int num_tiled_conns = 0; int i; for (i = 0; i < connector_count; i++) { if (connectors[i]->has_tile && connectors[i]->status == connector_status_connected) num_tiled_conns++; } retry: for (i = 0; i < connector_count; i++) { struct drm_connector *connector = connectors[i]; const char *mode_type; if (conn_configured & BIT_ULL(i)) continue; if (enabled[i] == false) { conn_configured |= BIT_ULL(i); continue; } /* first pass over all the untiled connectors */ if (tile_pass == 0 && connector->has_tile) continue; if (tile_pass == 1) { if (connector->tile_h_loc != 0 || connector->tile_v_loc != 0) continue; } else { if (connector->tile_h_loc != tile_pass - 1 && connector->tile_v_loc != tile_pass - 1) /* if this tile_pass doesn't cover any of the tiles - keep going */ continue; /* * find the tile offsets for this pass - need to find * all tiles left and above */ drm_client_get_tile_offsets(dev, connectors, connector_count, modes, offsets, i, connector->tile_h_loc, connector->tile_v_loc); } mode_type = "cmdline"; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connector)); if (!modes[i]) { mode_type = "preferred"; mode_replace(dev, &modes[i], drm_connector_preferred_mode(connector, width, height)); } if (!modes[i]) { mode_type = "first"; mode_replace(dev, &modes[i], drm_connector_first_mode(connector)); } /* * In case of tiled mode if all tiles not present fallback to * first available non tiled mode. * After all tiles are present, try to find the tiled mode * for all and if tiled mode not present due to fbcon size * limitations, use first non tiled mode only for * tile 0,0 and set to no mode for all other tiles. */ if (connector->has_tile) { if (num_tiled_conns < connector->num_h_tile * connector->num_v_tile || (connector->tile_h_loc == 0 && connector->tile_v_loc == 0 && !drm_connector_get_tiled_mode(connector))) { mode_type = "non tiled"; mode_replace(dev, &modes[i], drm_connector_fallback_non_tiled_mode(connector)); } else { mode_type = "tiled"; mode_replace(dev, &modes[i], drm_connector_get_tiled_mode(connector)); } } if (modes[i]) drm_dbg_kms(dev, "[CONNECTOR:%d:%s] found %s mode: %s\n", connector->base.id, connector->name, mode_type, modes[i]->name); else drm_dbg_kms(dev, "[CONNECTOR:%d:%s] no mode found\n", connector->base.id, connector->name); conn_configured |= BIT_ULL(i); } if ((conn_configured & mask) != mask) { tile_pass++; goto retry; } return true; } static bool connector_has_possible_crtc(struct drm_connector *connector, struct drm_crtc *crtc) { struct drm_encoder *encoder; drm_connector_for_each_possible_encoder(connector, encoder) { if (encoder->possible_crtcs & drm_crtc_mask(crtc)) return true; } return false; } static int drm_client_pick_crtcs(struct drm_client_dev *client, struct drm_connector *connectors[], unsigned int connector_count, struct drm_crtc *best_crtcs[], const struct drm_display_mode *modes[], int n, int width, int height) { struct drm_device *dev = client->dev; struct drm_connector *connector; int my_score, best_score, score; struct drm_crtc **crtcs; struct drm_mode_set *modeset; if (n == connector_count) return 0; connector = connectors[n]; best_crtcs[n] = NULL; best_score = drm_client_pick_crtcs(client, connectors, connector_count, best_crtcs, modes, n + 1, width, height); if (modes[n] == NULL) return best_score; crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL); if (!crtcs) return best_score; my_score = 1; if (connector->status == connector_status_connected) my_score++; if (connector->cmdline_mode.specified) my_score++; if (drm_connector_preferred_mode(connector, width, height)) my_score++; /* * select a crtc for this connector and then attempt to configure * remaining connectors */ drm_client_for_each_modeset(modeset, client) { struct drm_crtc *crtc = modeset->crtc; int o; if (!connector_has_possible_crtc(connector, crtc)) continue; for (o = 0; o < n; o++) if (best_crtcs[o] == crtc) break; if (o < n) { /* ignore cloning unless only a single crtc */ if (dev->mode_config.num_crtc > 1) continue; if (!drm_mode_equal(modes[o], modes[n])) continue; } crtcs[n] = crtc; memcpy(crtcs, best_crtcs, n * sizeof(*crtcs)); score = my_score + drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, n + 1, width, height); if (score > best_score) { best_score = score; memcpy(best_crtcs, crtcs, connector_count * sizeof(*crtcs)); } } kfree(crtcs); return best_score; } /* Try to read the BIOS display configuration and use it for the initial config */ static bool drm_client_firmware_config(struct drm_client_dev *client, struct drm_connector *connectors[], unsigned int connector_count, struct drm_crtc *crtcs[], const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { const int count = min_t(unsigned int, connector_count, BITS_PER_LONG); unsigned long conn_configured, conn_seq, mask; struct drm_device *dev = client->dev; int i; bool *save_enabled; bool fallback = true, ret = true; int num_connectors_enabled = 0; int num_connectors_detected = 0; int num_tiled_conns = 0; struct drm_modeset_acquire_ctx ctx; if (!drm_drv_uses_atomic_modeset(dev)) return false; if (drm_WARN_ON(dev, count <= 0)) return false; save_enabled = kcalloc(count, sizeof(bool), GFP_KERNEL); if (!save_enabled) return false; drm_modeset_acquire_init(&ctx, 0); while (drm_modeset_lock_all_ctx(dev, &ctx) != 0) drm_modeset_backoff(&ctx); memcpy(save_enabled, enabled, count); mask = GENMASK(count - 1, 0); conn_configured = 0; for (i = 0; i < count; i++) { if (connectors[i]->has_tile && connectors[i]->status == connector_status_connected) num_tiled_conns++; } retry: conn_seq = conn_configured; for (i = 0; i < count; i++) { struct drm_connector *connector = connectors[i]; struct drm_encoder *encoder; struct drm_crtc *crtc; const char *mode_type; int j; if (conn_configured & BIT(i)) continue; if (conn_seq == 0 && !connector->has_tile) continue; if (connector->status == connector_status_connected) num_connectors_detected++; if (!enabled[i]) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] not enabled, skipping\n", connector->base.id, connector->name); conn_configured |= BIT(i); continue; } if (connector->force == DRM_FORCE_OFF) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] disabled by user, skipping\n", connector->base.id, connector->name); enabled[i] = false; continue; } encoder = connector->state->best_encoder; if (!encoder || drm_WARN_ON(dev, !connector->state->crtc)) { if (connector->force > DRM_FORCE_OFF) goto bail; drm_dbg_kms(dev, "[CONNECTOR:%d:%s] has no encoder or crtc, skipping\n", connector->base.id, connector->name); enabled[i] = false; conn_configured |= BIT(i); continue; } num_connectors_enabled++; crtc = connector->state->crtc; /* * Make sure we're not trying to drive multiple connectors * with a single CRTC, since our cloning support may not * match the BIOS. */ for (j = 0; j < count; j++) { if (crtcs[j] == crtc) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] fallback: cloned configuration\n", connector->base.id, connector->name); goto bail; } } mode_type = "cmdline"; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connector)); if (!modes[i]) { mode_type = "preferred"; mode_replace(dev, &modes[i], drm_connector_preferred_mode(connector, width, height)); } if (!modes[i]) { mode_type = "first"; mode_replace(dev, &modes[i], drm_connector_first_mode(connector)); } /* last resort: use current mode */ if (!modes[i]) { mode_type = "current"; mode_replace(dev, &modes[i], &crtc->state->mode); } /* * In case of tiled modes, if all tiles are not present * then fallback to a non tiled mode. */ if (connector->has_tile && num_tiled_conns < connector->num_h_tile * connector->num_v_tile) { mode_type = "non tiled"; mode_replace(dev, &modes[i], drm_connector_fallback_non_tiled_mode(connector)); } crtcs[i] = crtc; drm_dbg_kms(dev, "[CONNECTOR::%d:%s] on [CRTC:%d:%s] using %s mode: %s\n", connector->base.id, connector->name, crtc->base.id, crtc->name, mode_type, modes[i]->name); fallback = false; conn_configured |= BIT(i); } if ((conn_configured & mask) != mask && conn_configured != conn_seq) goto retry; for (i = 0; i < count; i++) { struct drm_connector *connector = connectors[i]; if (connector->has_tile) drm_client_get_tile_offsets(dev, connectors, connector_count, modes, offsets, i, connector->tile_h_loc, connector->tile_v_loc); } /* * If the BIOS didn't enable everything it could, fall back to have the * same user experiencing of lighting up as much as possible like the * fbdev helper library. */ if (num_connectors_enabled != num_connectors_detected && num_connectors_enabled < dev->mode_config.num_crtc) { drm_dbg_kms(dev, "fallback: Not all outputs enabled\n"); drm_dbg_kms(dev, "Enabled: %i, detected: %i\n", num_connectors_enabled, num_connectors_detected); fallback = true; } if (fallback) { bail: drm_dbg_kms(dev, "Not using firmware configuration\n"); memcpy(enabled, save_enabled, count); ret = false; } drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); kfree(save_enabled); return ret; } /** * drm_client_modeset_probe() - Probe for displays * @client: DRM client * @width: Maximum display mode width (optional) * @height: Maximum display mode height (optional) * * This function sets up display pipelines for enabled connectors and stores the * config in the client's modeset array. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, unsigned int height) { struct drm_connector *connector, **connectors = NULL; struct drm_connector_list_iter conn_iter; struct drm_device *dev = client->dev; unsigned int total_modes_count = 0; struct drm_client_offset *offsets; unsigned int connector_count = 0; const struct drm_display_mode **modes; struct drm_crtc **crtcs; int i, ret = 0; bool *enabled; drm_dbg_kms(dev, "\n"); if (!width) width = dev->mode_config.max_width; if (!height) height = dev->mode_config.max_height; drm_connector_list_iter_begin(dev, &conn_iter); drm_client_for_each_connector_iter(connector, &conn_iter) { struct drm_connector **tmp; tmp = krealloc(connectors, (connector_count + 1) * sizeof(*connectors), GFP_KERNEL); if (!tmp) { ret = -ENOMEM; goto free_connectors; } connectors = tmp; drm_connector_get(connector); connectors[connector_count++] = connector; } drm_connector_list_iter_end(&conn_iter); if (!connector_count) return 0; crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL); modes = kcalloc(connector_count, sizeof(*modes), GFP_KERNEL); offsets = kcalloc(connector_count, sizeof(*offsets), GFP_KERNEL); enabled = kcalloc(connector_count, sizeof(bool), GFP_KERNEL); if (!crtcs || !modes || !enabled || !offsets) { ret = -ENOMEM; goto out; } mutex_lock(&client->modeset_mutex); mutex_lock(&dev->mode_config.mutex); for (i = 0; i < connector_count; i++) total_modes_count += connectors[i]->funcs->fill_modes(connectors[i], width, height); if (!total_modes_count) drm_dbg_kms(dev, "No connectors reported connected with modes\n"); drm_client_connectors_enabled(connectors, connector_count, enabled); if (!drm_client_firmware_config(client, connectors, connector_count, crtcs, modes, offsets, enabled, width, height)) { modes_destroy(dev, modes, connector_count); memset(crtcs, 0, connector_count * sizeof(*crtcs)); memset(offsets, 0, connector_count * sizeof(*offsets)); if (!drm_client_target_cloned(dev, connectors, connector_count, modes, offsets, enabled, width, height) && !drm_client_target_preferred(dev, connectors, connector_count, modes, offsets, enabled, width, height)) drm_err(dev, "Unable to find initial modes\n"); drm_dbg_kms(dev, "picking CRTCs for %dx%d config\n", width, height); drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, 0, width, height); } mutex_unlock(&dev->mode_config.mutex); drm_client_modeset_release(client); for (i = 0; i < connector_count; i++) { const struct drm_display_mode *mode = modes[i]; struct drm_crtc *crtc = crtcs[i]; struct drm_client_offset *offset = &offsets[i]; if (mode && crtc) { struct drm_mode_set *modeset = drm_client_find_modeset(client, crtc); struct drm_connector *connector = connectors[i]; drm_dbg_kms(dev, "[CRTC:%d:%s] desired mode %s set (%d,%d)\n", crtc->base.id, crtc->name, mode->name, offset->x, offset->y); if (drm_WARN_ON_ONCE(dev, modeset->num_connectors == DRM_CLIENT_MAX_CLONED_CONNECTORS || (dev->mode_config.num_crtc > 1 && modeset->num_connectors == 1))) { ret = -EINVAL; break; } drm_mode_destroy(dev, modeset->mode); modeset->mode = drm_mode_duplicate(dev, mode); if (!modeset->mode) { ret = -ENOMEM; break; } drm_connector_get(connector); modeset->connectors[modeset->num_connectors++] = connector; modeset->x = offset->x; modeset->y = offset->y; } } mutex_unlock(&client->modeset_mutex); out: kfree(crtcs); modes_destroy(dev, modes, connector_count); kfree(modes); kfree(offsets); kfree(enabled); free_connectors: for (i = 0; i < connector_count; i++) drm_connector_put(connectors[i]); kfree(connectors); return ret; } EXPORT_SYMBOL(drm_client_modeset_probe); /** * drm_client_rotation() - Check the initial rotation value * @modeset: DRM modeset * @rotation: Returned rotation value * * This function checks if the primary plane in @modeset can hw rotate * to match the rotation needed on its connector. * * Note: Currently only 0 and 180 degrees are supported. * * Return: * True if the plane can do the rotation, false otherwise. */ bool drm_client_rotation(struct drm_mode_set *modeset, unsigned int *rotation) { struct drm_connector *connector = modeset->connectors[0]; struct drm_plane *plane = modeset->crtc->primary; struct drm_cmdline_mode *cmdline; u64 valid_mask = 0; int i; if (!modeset->num_connectors) return false; switch (connector->display_info.panel_orientation) { case DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP: *rotation = DRM_MODE_ROTATE_180; break; case DRM_MODE_PANEL_ORIENTATION_LEFT_UP: *rotation = DRM_MODE_ROTATE_90; break; case DRM_MODE_PANEL_ORIENTATION_RIGHT_UP: *rotation = DRM_MODE_ROTATE_270; break; default: *rotation = DRM_MODE_ROTATE_0; } /** * The panel already defined the default rotation * through its orientation. Whatever has been provided * on the command line needs to be added to that. * * Unfortunately, the rotations are at different bit * indices, so the math to add them up are not as * trivial as they could. * * Reflections on the other hand are pretty trivial to deal with, a * simple XOR between the two handle the addition nicely. */ cmdline = &connector->cmdline_mode; if (cmdline->specified && cmdline->rotation_reflection) { unsigned int cmdline_rest, panel_rest; unsigned int cmdline_rot, panel_rot; unsigned int sum_rot, sum_rest; panel_rot = ilog2(*rotation & DRM_MODE_ROTATE_MASK); cmdline_rot = ilog2(cmdline->rotation_reflection & DRM_MODE_ROTATE_MASK); sum_rot = (panel_rot + cmdline_rot) % 4; panel_rest = *rotation & ~DRM_MODE_ROTATE_MASK; cmdline_rest = cmdline->rotation_reflection & ~DRM_MODE_ROTATE_MASK; sum_rest = panel_rest ^ cmdline_rest; *rotation = (1 << sum_rot) | sum_rest; } /* * TODO: support 90 / 270 degree hardware rotation, * depending on the hardware this may require the framebuffer * to be in a specific tiling format. */ if (((*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_0 && (*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_180) || !plane->rotation_property) return false; for (i = 0; i < plane->rotation_property->num_values; i++) valid_mask |= (1ULL << plane->rotation_property->values[i]); if (!(*rotation & valid_mask)) return false; return true; } EXPORT_SYMBOL(drm_client_rotation); static int drm_client_modeset_commit_atomic(struct drm_client_dev *client, bool active, bool check) { struct drm_device *dev = client->dev; struct drm_plane *plane; struct drm_atomic_state *state; struct drm_modeset_acquire_ctx ctx; struct drm_mode_set *mode_set; int ret; drm_modeset_acquire_init(&ctx, 0); state = drm_atomic_state_alloc(dev); if (!state) { ret = -ENOMEM; goto out_ctx; } state->acquire_ctx = &ctx; retry: drm_for_each_plane(plane, dev) { struct drm_plane_state *plane_state; plane_state = drm_atomic_get_plane_state(state, plane); if (IS_ERR(plane_state)) { ret = PTR_ERR(plane_state); goto out_state; } plane_state->rotation = DRM_MODE_ROTATE_0; /* disable non-primary: */ if (plane->type == DRM_PLANE_TYPE_PRIMARY) continue; ret = __drm_atomic_helper_disable_plane(plane, plane_state); if (ret != 0) goto out_state; } drm_client_for_each_modeset(mode_set, client) { struct drm_plane *primary = mode_set->crtc->primary; unsigned int rotation; if (drm_client_rotation(mode_set, &rotation)) { struct drm_plane_state *plane_state; /* Cannot fail as we've already gotten the plane state above */ plane_state = drm_atomic_get_new_plane_state(state, primary); plane_state->rotation = rotation; } ret = __drm_atomic_helper_set_config(mode_set, state); if (ret != 0) goto out_state; /* * __drm_atomic_helper_set_config() sets active when a * mode is set, unconditionally clear it if we force DPMS off */ if (!active) { struct drm_crtc *crtc = mode_set->crtc; struct drm_crtc_state *crtc_state = drm_atomic_get_new_crtc_state(state, crtc); crtc_state->active = false; } } if (check) ret = drm_atomic_check_only(state); else ret = drm_atomic_commit(state); out_state: if (ret == -EDEADLK) goto backoff; drm_atomic_state_put(state); out_ctx: drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); return ret; backoff: drm_atomic_state_clear(state); drm_modeset_backoff(&ctx); goto retry; } static int drm_client_modeset_commit_legacy(struct drm_client_dev *client) { struct drm_device *dev = client->dev; struct drm_mode_set *mode_set; struct drm_plane *plane; int ret = 0; drm_modeset_lock_all(dev); drm_for_each_plane(plane, dev) { if (plane->type != DRM_PLANE_TYPE_PRIMARY) drm_plane_force_disable(plane); if (plane->rotation_property) drm_mode_plane_set_obj_prop(plane, plane->rotation_property, DRM_MODE_ROTATE_0); } drm_client_for_each_modeset(mode_set, client) { struct drm_crtc *crtc = mode_set->crtc; if (crtc->funcs->cursor_set2) { ret = crtc->funcs->cursor_set2(crtc, NULL, 0, 0, 0, 0, 0); if (ret) goto out; } else if (crtc->funcs->cursor_set) { ret = crtc->funcs->cursor_set(crtc, NULL, 0, 0, 0); if (ret) goto out; } ret = drm_mode_set_config_internal(mode_set); if (ret) goto out; } out: drm_modeset_unlock_all(dev); return ret; } /** * drm_client_modeset_check() - Check modeset configuration * @client: DRM client * * Check modeset configuration. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_check(struct drm_client_dev *client) { int ret; if (!drm_drv_uses_atomic_modeset(client->dev)) return 0; mutex_lock(&client->modeset_mutex); ret = drm_client_modeset_commit_atomic(client, true, true); mutex_unlock(&client->modeset_mutex); return ret; } EXPORT_SYMBOL(drm_client_modeset_check); /** * drm_client_modeset_commit_locked() - Force commit CRTC configuration * @client: DRM client * * Commit modeset configuration to crtcs without checking if there is a DRM * master. The assumption is that the caller already holds an internal DRM * master reference acquired with drm_master_internal_acquire(). * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_commit_locked(struct drm_client_dev *client) { struct drm_device *dev = client->dev; int ret; mutex_lock(&client->modeset_mutex); if (drm_drv_uses_atomic_modeset(dev)) ret = drm_client_modeset_commit_atomic(client, true, false); else ret = drm_client_modeset_commit_legacy(client); mutex_unlock(&client->modeset_mutex); return ret; } EXPORT_SYMBOL(drm_client_modeset_commit_locked); /** * drm_client_modeset_commit() - Commit CRTC configuration * @client: DRM client * * Commit modeset configuration to crtcs. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_commit(struct drm_client_dev *client) { struct drm_device *dev = client->dev; int ret; if (!drm_master_internal_acquire(dev)) return -EBUSY; ret = drm_client_modeset_commit_locked(client); drm_master_internal_release(dev); return ret; } EXPORT_SYMBOL(drm_client_modeset_commit); static void drm_client_modeset_dpms_legacy(struct drm_client_dev *client, int dpms_mode) { struct drm_device *dev = client->dev; struct drm_connector *connector; struct drm_mode_set *modeset; struct drm_modeset_acquire_ctx ctx; int ret; DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, 0, ret); drm_client_for_each_modeset(modeset, client) { int j; if (!modeset->crtc->enabled) continue; for (j = 0; j < modeset->num_connectors; j++) { connector = modeset->connectors[j]; connector->funcs->dpms(connector, dpms_mode); drm_object_property_set_value(&connector->base, dev->mode_config.dpms_property, dpms_mode); } } DRM_MODESET_LOCK_ALL_END(dev, ctx, ret); } /** * drm_client_modeset_dpms() - Set DPMS mode * @client: DRM client * @mode: DPMS mode * * Note: For atomic drivers @mode is reduced to on/off. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_dpms(struct drm_client_dev *client, int mode) { struct drm_device *dev = client->dev; int ret = 0; if (!drm_master_internal_acquire(dev)) return -EBUSY; mutex_lock(&client->modeset_mutex); if (drm_drv_uses_atomic_modeset(dev)) ret = drm_client_modeset_commit_atomic(client, mode == DRM_MODE_DPMS_ON, false); else drm_client_modeset_dpms_legacy(client, mode); mutex_unlock(&client->modeset_mutex); drm_master_internal_release(dev); return ret; } EXPORT_SYMBOL(drm_client_modeset_dpms); #ifdef CONFIG_DRM_KUNIT_TEST #include "tests/drm_client_modeset_test.c" #endif
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * RNG: Random Number Generator algorithms under the crypto API * * Copyright (c) 2008 Neil Horman <nhorman@tuxdriver.com> * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_RNG_H #define _CRYPTO_RNG_H #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/crypto.h> struct crypto_rng; /** * struct rng_alg - random number generator definition * * @generate: The function defined by this variable obtains a * random number. The random number generator transform * must generate the random number out of the context * provided with this call, plus any additional data * if provided to the call. * @seed: Seed or reseed the random number generator. With the * invocation of this function call, the random number * generator shall become ready for generation. If the * random number generator requires a seed for setting * up a new state, the seed must be provided by the * consumer while invoking this function. The required * size of the seed is defined with @seedsize . * @set_ent: Set entropy that would otherwise be obtained from * entropy source. Internal use only. * @seedsize: The seed size required for a random number generator * initialization defined with this variable. Some * random number generators does not require a seed * as the seeding is implemented internally without * the need of support by the consumer. In this case, * the seed size is set to zero. * @base: Common crypto API algorithm data structure. */ struct rng_alg { int (*generate)(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int dlen); int (*seed)(struct crypto_rng *tfm, const u8 *seed, unsigned int slen); void (*set_ent)(struct crypto_rng *tfm, const u8 *data, unsigned int len); unsigned int seedsize; struct crypto_alg base; }; struct crypto_rng { struct crypto_tfm base; }; extern struct crypto_rng *crypto_default_rng; int crypto_get_default_rng(void); void crypto_put_default_rng(void); /** * DOC: Random number generator API * * The random number generator API is used with the ciphers of type * CRYPTO_ALG_TYPE_RNG (listed as type "rng" in /proc/crypto) */ /** * crypto_alloc_rng() -- allocate RNG handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * message digest cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for a random number generator. The returned struct * crypto_rng is the cipher handle that is required for any subsequent * API invocation for that random number generator. * * For all random number generators, this call creates a new private copy of * the random number generator that does not share a state with other * instances. The only exception is the "krng" random number generator which * is a kernel crypto API use case for the get_random_bytes() function of the * /dev/random driver. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_rng_tfm(struct crypto_rng *tfm) { return &tfm->base; } static inline struct rng_alg *__crypto_rng_alg(struct crypto_alg *alg) { return container_of(alg, struct rng_alg, base); } /** * crypto_rng_alg() - obtain 'struct rng_alg' pointer from RNG handle * @tfm: RNG handle * * Return: Pointer to 'struct rng_alg', derived from @tfm RNG handle */ static inline struct rng_alg *crypto_rng_alg(struct crypto_rng *tfm) { return __crypto_rng_alg(crypto_rng_tfm(tfm)->__crt_alg); } /** * crypto_free_rng() - zeroize and free RNG handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_rng(struct crypto_rng *tfm) { crypto_destroy_tfm(tfm, crypto_rng_tfm(tfm)); } /** * crypto_rng_generate() - get random number * @tfm: cipher handle * @src: Input buffer holding additional data, may be NULL * @slen: Length of additional data * @dst: output buffer holding the random numbers * @dlen: length of the output buffer * * This function fills the caller-allocated buffer with random * numbers using the random number generator referenced by the * cipher handle. * * Return: 0 function was successful; < 0 if an error occurred */ static inline int crypto_rng_generate(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int dlen) { return crypto_rng_alg(tfm)->generate(tfm, src, slen, dst, dlen); } /** * crypto_rng_get_bytes() - get random number * @tfm: cipher handle * @rdata: output buffer holding the random numbers * @dlen: length of the output buffer * * This function fills the caller-allocated buffer with random numbers using the * random number generator referenced by the cipher handle. * * Return: 0 function was successful; < 0 if an error occurred */ static inline int crypto_rng_get_bytes(struct crypto_rng *tfm, u8 *rdata, unsigned int dlen) { return crypto_rng_generate(tfm, NULL, 0, rdata, dlen); } /** * crypto_rng_reset() - re-initialize the RNG * @tfm: cipher handle * @seed: seed input data * @slen: length of the seed input data * * The reset function completely re-initializes the random number generator * referenced by the cipher handle by clearing the current state. The new state * is initialized with the caller provided seed or automatically, depending * on the random number generator type (the ANSI X9.31 RNG requires * caller-provided seed, the SP800-90A DRBGs perform an automatic seeding). * The seed is provided as a parameter to this function call. The provided seed * should have the length of the seed size defined for the random number * generator as defined by crypto_rng_seedsize. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen); /** * crypto_rng_seedsize() - obtain seed size of RNG * @tfm: cipher handle * * The function returns the seed size for the random number generator * referenced by the cipher handle. This value may be zero if the random * number generator does not implement or require a reseeding. For example, * the SP800-90A DRBGs implement an automated reseeding after reaching a * pre-defined threshold. * * Return: seed size for the random number generator */ static inline int crypto_rng_seedsize(struct crypto_rng *tfm) { return crypto_rng_alg(tfm)->seedsize; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef __DSA_TAG_H #define __DSA_TAG_H #include <linux/if_vlan.h> #include <linux/list.h> #include <linux/types.h> #include <net/dsa.h> #include "port.h" #include "user.h" struct dsa_tag_driver { const struct dsa_device_ops *ops; struct list_head list; struct module *owner; }; extern struct packet_type dsa_pack_type; const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol); const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name); void dsa_tag_driver_put(const struct dsa_device_ops *ops); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops) { return ops->needed_headroom + ops->needed_tailroom; } static inline struct net_device *dsa_conduit_find_user(struct net_device *dev, int device, int port) { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dp->ds->index == device && dp->index == port && dp->type == DSA_PORT_TYPE_USER) return dp->user; return NULL; } /** * dsa_software_untag_vlan_aware_bridge: Software untagging for VLAN-aware bridge * @skb: Pointer to received socket buffer (packet) * @br: Pointer to bridge upper interface of ingress port * @vid: Parsed VID from packet * * The bridge can process tagged packets. Software like STP/PTP may not. The * bridge can also process untagged packets, to the same effect as if they were * tagged with the PVID of the ingress port. So packets tagged with the PVID of * the bridge port must be software-untagged, to support both use cases. */ static inline void dsa_software_untag_vlan_aware_bridge(struct sk_buff *skb, struct net_device *br, u16 vid) { u16 pvid, proto; int err; err = br_vlan_get_proto(br, &proto); if (err) return; err = br_vlan_get_pvid_rcu(skb->dev, &pvid); if (err) return; if (vid == pvid && skb->vlan_proto == htons(proto)) __vlan_hwaccel_clear_tag(skb); } /** * dsa_software_untag_vlan_unaware_bridge: Software untagging for VLAN-unaware bridge * @skb: Pointer to received socket buffer (packet) * @br: Pointer to bridge upper interface of ingress port * @vid: Parsed VID from packet * * The bridge ignores all VLAN tags. Software like STP/PTP may not (it may run * on the plain port, or on a VLAN upper interface). Maybe packets are coming * to software as tagged with a driver-defined VID which is NOT equal to the * PVID of the bridge port (since the bridge is VLAN-unaware, its configuration * should NOT be committed to hardware). DSA needs a method for this private * VID to be communicated by software to it, and if packets are tagged with it, * software-untag them. Note: the private VID may be different per bridge, to * support the FDB isolation use case. * * FIXME: this is currently implemented based on the broken assumption that * the "private VID" used by the driver in VLAN-unaware mode is equal to the * bridge PVID. It should not be, except for a coincidence; the bridge PVID is * irrelevant to the data path in the VLAN-unaware mode. Thus, the VID that * this function removes is wrong. * * All users of ds->untag_bridge_pvid should fix their drivers, if necessary, * to make the two independent. Only then, if there still remains a need to * strip the private VID from packets, then a new ds->ops->get_private_vid() * API shall be introduced to communicate to DSA what this VID is, which needs * to be stripped here. */ static inline void dsa_software_untag_vlan_unaware_bridge(struct sk_buff *skb, struct net_device *br, u16 vid) { struct net_device *upper_dev; u16 pvid, proto; int err; err = br_vlan_get_proto(br, &proto); if (err) return; err = br_vlan_get_pvid_rcu(skb->dev, &pvid); if (err) return; if (vid != pvid || skb->vlan_proto != htons(proto)) return; /* The sad part about attempting to untag from DSA is that we * don't know, unless we check, if the skb will end up in * the bridge's data path - br_allowed_ingress() - or not. * For example, there might be an 8021q upper for the * default_pvid of the bridge, which will steal VLAN-tagged traffic * from the bridge's data path. This is a configuration that DSA * supports because vlan_filtering is 0. In that case, we should * definitely keep the tag, to make sure it keeps working. */ upper_dev = __vlan_find_dev_deep_rcu(br, htons(proto), vid); if (!upper_dev) __vlan_hwaccel_clear_tag(skb); } /** * dsa_software_vlan_untag: Software VLAN untagging in DSA receive path * @skb: Pointer to socket buffer (packet) * * Receive path method for switches which send some packets as VLAN-tagged * towards the CPU port (generally from VLAN-aware bridge ports) even when the * packet was not tagged on the wire. Called when ds->untag_bridge_pvid * (legacy) or ds->untag_vlan_aware_bridge_pvid is set to true. * * As a side effect of this method, any VLAN tag from the skb head is moved * to hwaccel. */ static inline struct sk_buff *dsa_software_vlan_untag(struct sk_buff *skb) { struct dsa_port *dp = dsa_user_to_port(skb->dev); struct net_device *br = dsa_port_bridge_dev_get(dp); u16 vid, proto; int err; /* software untagging for standalone ports not yet necessary */ if (!br) return skb; err = br_vlan_get_proto(br, &proto); if (err) return skb; /* Move VLAN tag from data to hwaccel */ if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) { skb = skb_vlan_untag(skb); if (!skb) return NULL; } if (!skb_vlan_tag_present(skb)) return skb; vid = skb_vlan_tag_get_id(skb); if (br_vlan_enabled(br)) { if (dp->ds->untag_vlan_aware_bridge_pvid) dsa_software_untag_vlan_aware_bridge(skb, br, vid); } else { if (dp->ds->untag_bridge_pvid) dsa_software_untag_vlan_unaware_bridge(skb, br, vid); } return skb; } /* For switches without hardware support for DSA tagging to be able * to support termination through the bridge. */ static inline struct net_device * dsa_find_designated_bridge_port_by_vid(struct net_device *conduit, u16 vid) { struct dsa_port *cpu_dp = conduit->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct bridge_vlan_info vinfo; struct net_device *user; struct dsa_port *dp; int err; list_for_each_entry(dp, &dst->ports, list) { if (dp->type != DSA_PORT_TYPE_USER) continue; if (!dp->bridge) continue; if (dp->stp_state != BR_STATE_LEARNING && dp->stp_state != BR_STATE_FORWARDING) continue; /* Since the bridge might learn this packet, keep the CPU port * affinity with the port that will be used for the reply on * xmit. */ if (dp->cpu_dp != cpu_dp) continue; user = dp->user; err = br_vlan_get_info_rcu(user, vid, &vinfo); if (err) continue; return user; } return NULL; } /* If the ingress port offloads the bridge, we mark the frame as autonomously * forwarded by hardware, so the software bridge doesn't forward in twice, back * to us, because we already did. However, if we're in fallback mode and we do * software bridging, we are not offloading it, therefore the dp->bridge * pointer is not populated, and flooding needs to be done by software (we are * effectively operating in standalone ports mode). */ static inline void dsa_default_offload_fwd_mark(struct sk_buff *skb) { struct dsa_port *dp = dsa_user_to_port(skb->dev); skb->offload_fwd_mark = !!(dp->bridge); } /* Helper for removing DSA header tags from packets in the RX path. * Must not be called before skb_pull(len). * skb->data * | * v * | | | | | | | | | | | | | | | | | | | * +-----------------------+-----------------------+---------------+-------+ * | Destination MAC | Source MAC | DSA header | EType | * +-----------------------+-----------------------+---------------+-------+ * | | * <----- len -----> <----- len -----> * | * >>>>>>> v * >>>>>>> | | | | | | | | | | | | | | | * >>>>>>> +-----------------------+-----------------------+-------+ * >>>>>>> | Destination MAC | Source MAC | EType | * +-----------------------+-----------------------+-------+ * ^ * | * skb->data */ static inline void dsa_strip_etype_header(struct sk_buff *skb, int len) { memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - len, 2 * ETH_ALEN); } /* Helper for creating space for DSA header tags in TX path packets. * Must not be called before skb_push(len). * * Before: * * <<<<<<< | | | | | | | | | | | | | | | * ^ <<<<<<< +-----------------------+-----------------------+-------+ * | <<<<<<< | Destination MAC | Source MAC | EType | * | +-----------------------+-----------------------+-------+ * <----- len -----> * | * | * skb->data * * After: * * | | | | | | | | | | | | | | | | | | | * +-----------------------+-----------------------+---------------+-------+ * | Destination MAC | Source MAC | DSA header | EType | * +-----------------------+-----------------------+---------------+-------+ * ^ | | * | <----- len -----> * skb->data */ static inline void dsa_alloc_etype_header(struct sk_buff *skb, int len) { memmove(skb->data, skb->data + len, 2 * ETH_ALEN); } /* On RX, eth_type_trans() on the DSA conduit pulls ETH_HLEN bytes starting from * skb_mac_header(skb), which leaves skb->data pointing at the first byte after * what the DSA conduit perceives as the EtherType (the beginning of the L3 * protocol). Since DSA EtherType header taggers treat the EtherType as part of * the DSA tag itself, and the EtherType is 2 bytes in length, the DSA header * is located 2 bytes behind skb->data. Note that EtherType in this context * means the first 2 bytes of the DSA header, not the encapsulated EtherType * that will become visible after the DSA header is stripped. */ static inline void *dsa_etype_header_pos_rx(struct sk_buff *skb) { return skb->data - 2; } /* On TX, skb->data points to the MAC header, which means that EtherType * header taggers start exactly where the EtherType is (the EtherType is * treated as part of the DSA header). */ static inline void *dsa_etype_header_pos_tx(struct sk_buff *skb) { return skb->data + 2 * ETH_ALEN; } /* Create 2 modaliases per tagging protocol, one to auto-load the module * given the ID reported by get_tag_protocol(), and the other by name. */ #define DSA_TAG_DRIVER_ALIAS "dsa_tag:" #define MODULE_ALIAS_DSA_TAG_DRIVER(__proto, __name) \ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __name); \ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS "id-" \ __stringify(__proto##_VALUE)) void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], unsigned int count, struct module *owner); void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], unsigned int count); #define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \ static int __init dsa_tag_driver_module_init(void) \ { \ dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \ THIS_MODULE); \ return 0; \ } \ module_init(dsa_tag_driver_module_init); \ \ static void __exit dsa_tag_driver_module_exit(void) \ { \ dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \ } \ module_exit(dsa_tag_driver_module_exit) /** * module_dsa_tag_drivers() - Helper macro for registering DSA tag * drivers * @__ops_array: Array of tag driver structures * * Helper macro for DSA tag drivers which do not do anything special * in module init/exit. Each module may only use this macro once, and * calling it replaces module_init() and module_exit(). */ #define module_dsa_tag_drivers(__ops_array) \ dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array)) #define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops /* Create a static structure we can build a linked list of dsa_tag * drivers */ #define DSA_TAG_DRIVER(__ops) \ static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \ .ops = &__ops, \ } /** * module_dsa_tag_driver() - Helper macro for registering a single DSA tag * driver * @__ops: Single tag driver structures * * Helper macro for DSA tag drivers which do not do anything special * in module init/exit. Each module may only use this macro once, and * calling it replaces module_init() and module_exit(). */ #define module_dsa_tag_driver(__ops) \ DSA_TAG_DRIVER(__ops); \ \ static struct dsa_tag_driver *dsa_tag_driver_array[] = { \ &DSA_TAG_DRIVER_NAME(__ops) \ }; \ module_dsa_tag_drivers(dsa_tag_driver_array) #endif
1965 1841 98 1943 114 17 1935 78 1 3 4 44 14 9 22 2 10 1 116 259 211 92 177 135 60 1 1 17 14 8 5 2 2 19 19 1 19 285 180 98 121 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244