Total coverage: 379836 (19%)of 2051479
4 4 4 1 3 4 4 3 3 3 3 3 3 3 3 3 8 7 4 4 4 4 7 3 7 4 4 4 4 4 4 1 8 8 8 8 8 8 8 1 7 4 6 3 5 8 8 8 8 4 8 3 8 3 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 // SPDX-License-Identifier: GPL-2.0-only /* * Line 6 Linux USB driver * * Copyright (C) 2004-2010 Markus Grabner (line6@grabner-graz.at) */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/usb.h> #include <sound/core.h> #include <sound/initval.h> #include <sound/hwdep.h> #include "capture.h" #include "driver.h" #include "midi.h" #include "playback.h" #define DRIVER_AUTHOR "Markus Grabner <line6@grabner-graz.at>" #define DRIVER_DESC "Line 6 USB Driver" /* This is Line 6's MIDI manufacturer ID. */ const unsigned char line6_midi_id[3] = { 0x00, 0x01, 0x0c }; EXPORT_SYMBOL_GPL(line6_midi_id); /* Code to request version of POD, Variax interface (and maybe other devices). */ static const char line6_request_version[] = { 0xf0, 0x7e, 0x7f, 0x06, 0x01, 0xf7 }; /* Class for asynchronous messages. */ struct message { struct usb_line6 *line6; const char *buffer; int size; int done; }; /* Forward declarations. */ static void line6_data_received(struct urb *urb); static int line6_send_raw_message_async_part(struct message *msg, struct urb *urb); /* Start to listen on endpoint. */ static int line6_start_listen(struct usb_line6 *line6) { int err; if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { usb_fill_int_urb(line6->urb_listen, line6->usbdev, usb_rcvintpipe(line6->usbdev, line6->properties->ep_ctrl_r), line6->buffer_listen, LINE6_BUFSIZE_LISTEN, line6_data_received, line6, line6->interval); } else { usb_fill_bulk_urb(line6->urb_listen, line6->usbdev, usb_rcvbulkpipe(line6->usbdev, line6->properties->ep_ctrl_r), line6->buffer_listen, LINE6_BUFSIZE_LISTEN, line6_data_received, line6); } /* sanity checks of EP before actually submitting */ if (usb_urb_ep_type_check(line6->urb_listen)) { dev_err(line6->ifcdev, "invalid control EP\n"); return -EINVAL; } line6->urb_listen->actual_length = 0; err = usb_submit_urb(line6->urb_listen, GFP_ATOMIC); return err; } /* Stop listening on endpoint. */ static void line6_stop_listen(struct usb_line6 *line6) { usb_kill_urb(line6->urb_listen); } /* Send raw message in pieces of wMaxPacketSize bytes. */ int line6_send_raw_message(struct usb_line6 *line6, const char *buffer, int size) { int i, done = 0; const struct line6_properties *properties = line6->properties; for (i = 0; i < size; i += line6->max_packet_size) { int partial; const char *frag_buf = buffer + i; int frag_size = min(line6->max_packet_size, size - i); int retval; if (properties->capabilities & LINE6_CAP_CONTROL_MIDI) { retval = usb_interrupt_msg(line6->usbdev, usb_sndintpipe(line6->usbdev, properties->ep_ctrl_w), (char *)frag_buf, frag_size, &partial, LINE6_TIMEOUT); } else { retval = usb_bulk_msg(line6->usbdev, usb_sndbulkpipe(line6->usbdev, properties->ep_ctrl_w), (char *)frag_buf, frag_size, &partial, LINE6_TIMEOUT); } if (retval) { dev_err(line6->ifcdev, "usb_bulk_msg failed (%d)\n", retval); break; } done += frag_size; } return done; } EXPORT_SYMBOL_GPL(line6_send_raw_message); /* Notification of completion of asynchronous request transmission. */ static void line6_async_request_sent(struct urb *urb) { struct message *msg = (struct message *)urb->context; if (msg->done >= msg->size) { usb_free_urb(urb); kfree(msg); } else line6_send_raw_message_async_part(msg, urb); } /* Asynchronously send part of a raw message. */ static int line6_send_raw_message_async_part(struct message *msg, struct urb *urb) { int retval; struct usb_line6 *line6 = msg->line6; int done = msg->done; int bytes = min(msg->size - done, line6->max_packet_size); if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { usb_fill_int_urb(urb, line6->usbdev, usb_sndintpipe(line6->usbdev, line6->properties->ep_ctrl_w), (char *)msg->buffer + done, bytes, line6_async_request_sent, msg, line6->interval); } else { usb_fill_bulk_urb(urb, line6->usbdev, usb_sndbulkpipe(line6->usbdev, line6->properties->ep_ctrl_w), (char *)msg->buffer + done, bytes, line6_async_request_sent, msg); } msg->done += bytes; /* sanity checks of EP before actually submitting */ retval = usb_urb_ep_type_check(urb); if (retval < 0) goto error; retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval < 0) goto error; return 0; error: dev_err(line6->ifcdev, "%s: usb_submit_urb failed (%d)\n", __func__, retval); usb_free_urb(urb); kfree(msg); return retval; } /* Asynchronously send raw message. */ int line6_send_raw_message_async(struct usb_line6 *line6, const char *buffer, int size) { struct message *msg; struct urb *urb; /* create message: */ msg = kzalloc(sizeof(struct message), GFP_ATOMIC); if (msg == NULL) return -ENOMEM; /* create URB: */ urb = usb_alloc_urb(0, GFP_ATOMIC); if (urb == NULL) { kfree(msg); return -ENOMEM; } /* set message data: */ msg->line6 = line6; msg->buffer = buffer; msg->size = size; msg->done = 0; /* start sending: */ return line6_send_raw_message_async_part(msg, urb); } EXPORT_SYMBOL_GPL(line6_send_raw_message_async); /* Send asynchronous device version request. */ int line6_version_request_async(struct usb_line6 *line6) { char *buffer; int retval; buffer = kmemdup(line6_request_version, sizeof(line6_request_version), GFP_ATOMIC); if (buffer == NULL) return -ENOMEM; retval = line6_send_raw_message_async(line6, buffer, sizeof(line6_request_version)); kfree(buffer); return retval; } EXPORT_SYMBOL_GPL(line6_version_request_async); /* Send sysex message in pieces of wMaxPacketSize bytes. */ int line6_send_sysex_message(struct usb_line6 *line6, const char *buffer, int size) { return line6_send_raw_message(line6, buffer, size + SYSEX_EXTRA_SIZE) - SYSEX_EXTRA_SIZE; } EXPORT_SYMBOL_GPL(line6_send_sysex_message); /* Allocate buffer for sysex message and prepare header. @param code sysex message code @param size number of bytes between code and sysex end */ char *line6_alloc_sysex_buffer(struct usb_line6 *line6, int code1, int code2, int size) { char *buffer = kmalloc(size + SYSEX_EXTRA_SIZE, GFP_ATOMIC); if (!buffer) return NULL; buffer[0] = LINE6_SYSEX_BEGIN; memcpy(buffer + 1, line6_midi_id, sizeof(line6_midi_id)); buffer[sizeof(line6_midi_id) + 1] = code1; buffer[sizeof(line6_midi_id) + 2] = code2; buffer[sizeof(line6_midi_id) + 3 + size] = LINE6_SYSEX_END; return buffer; } EXPORT_SYMBOL_GPL(line6_alloc_sysex_buffer); /* Notification of data received from the Line 6 device. */ static void line6_data_received(struct urb *urb) { struct usb_line6 *line6 = (struct usb_line6 *)urb->context; struct midi_buffer *mb = &line6->line6midi->midibuf_in; unsigned long flags; int done; if (urb->status == -ESHUTDOWN) return; if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { spin_lock_irqsave(&line6->line6midi->lock, flags); done = line6_midibuf_write(mb, urb->transfer_buffer, urb->actual_length); if (done < urb->actual_length) { line6_midibuf_ignore(mb, done); dev_dbg(line6->ifcdev, "%d %d buffer overflow - message skipped\n", done, urb->actual_length); } spin_unlock_irqrestore(&line6->line6midi->lock, flags); for (;;) { spin_lock_irqsave(&line6->line6midi->lock, flags); done = line6_midibuf_read(mb, line6->buffer_message, LINE6_MIDI_MESSAGE_MAXLEN, LINE6_MIDIBUF_READ_RX); spin_unlock_irqrestore(&line6->line6midi->lock, flags); if (done <= 0) break; line6->message_length = done; line6_midi_receive(line6, line6->buffer_message, done); if (line6->process_message) line6->process_message(line6); } } else { line6->buffer_message = urb->transfer_buffer; line6->message_length = urb->actual_length; if (line6->process_message) line6->process_message(line6); line6->buffer_message = NULL; } line6_start_listen(line6); } #define LINE6_READ_WRITE_STATUS_DELAY 2 /* milliseconds */ #define LINE6_READ_WRITE_MAX_RETRIES 50 /* Read data from device. */ int line6_read_data(struct usb_line6 *line6, unsigned address, void *data, unsigned datalen) { struct usb_device *usbdev = line6->usbdev; int ret; u8 len; unsigned count; if (address > 0xffff || datalen > 0xff) return -EINVAL; /* query the serial number: */ ret = usb_control_msg_send(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, (datalen << 8) | 0x21, address, NULL, 0, LINE6_TIMEOUT, GFP_KERNEL); if (ret) { dev_err(line6->ifcdev, "read request failed (error %d)\n", ret); goto exit; } /* Wait for data length. We'll get 0xff until length arrives. */ for (count = 0; count < LINE6_READ_WRITE_MAX_RETRIES; count++) { mdelay(LINE6_READ_WRITE_STATUS_DELAY); ret = usb_control_msg_recv(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x0012, 0x0000, &len, 1, LINE6_TIMEOUT, GFP_KERNEL); if (ret) { dev_err(line6->ifcdev, "receive length failed (error %d)\n", ret); goto exit; } if (len != 0xff) break; } ret = -EIO; if (len == 0xff) { dev_err(line6->ifcdev, "read failed after %d retries\n", count); goto exit; } else if (len != datalen) { /* should be equal or something went wrong */ dev_err(line6->ifcdev, "length mismatch (expected %d, got %d)\n", (int)datalen, len); goto exit; } /* receive the result: */ ret = usb_control_msg_recv(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x0013, 0x0000, data, datalen, LINE6_TIMEOUT, GFP_KERNEL); if (ret) dev_err(line6->ifcdev, "read failed (error %d)\n", ret); exit: return ret; } EXPORT_SYMBOL_GPL(line6_read_data); /* Write data to device. */ int line6_write_data(struct usb_line6 *line6, unsigned address, void *data, unsigned datalen) { struct usb_device *usbdev = line6->usbdev; int ret; unsigned char *status; int count; if (address > 0xffff || datalen > 0xffff) return -EINVAL; status = kmalloc(1, GFP_KERNEL); if (!status) return -ENOMEM; ret = usb_control_msg_send(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, 0x0022, address, data, datalen, LINE6_TIMEOUT, GFP_KERNEL); if (ret) { dev_err(line6->ifcdev, "write request failed (error %d)\n", ret); goto exit; } for (count = 0; count < LINE6_READ_WRITE_MAX_RETRIES; count++) { mdelay(LINE6_READ_WRITE_STATUS_DELAY); ret = usb_control_msg_recv(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x0012, 0x0000, status, 1, LINE6_TIMEOUT, GFP_KERNEL); if (ret) { dev_err(line6->ifcdev, "receiving status failed (error %d)\n", ret); goto exit; } if (*status != 0xff) break; } if (*status == 0xff) { dev_err(line6->ifcdev, "write failed after %d retries\n", count); ret = -EIO; } else if (*status != 0) { dev_err(line6->ifcdev, "write failed (error %d)\n", ret); ret = -EIO; } exit: kfree(status); return ret; } EXPORT_SYMBOL_GPL(line6_write_data); /* Read Line 6 device serial number. (POD, TonePort, GuitarPort) */ int line6_read_serial_number(struct usb_line6 *line6, u32 *serial_number) { return line6_read_data(line6, 0x80d0, serial_number, sizeof(*serial_number)); } EXPORT_SYMBOL_GPL(line6_read_serial_number); /* Card destructor. */ static void line6_destruct(struct snd_card *card) { struct usb_line6 *line6 = card->private_data; struct usb_device *usbdev = line6->usbdev; /* Free buffer memory first. We cannot depend on the existence of private * data from the (podhd) module, it may be gone already during this call */ kfree(line6->buffer_message); kfree(line6->buffer_listen); /* then free URBs: */ usb_free_urb(line6->urb_listen); line6->urb_listen = NULL; /* decrement reference counters: */ usb_put_dev(usbdev); } static void line6_get_usb_properties(struct usb_line6 *line6) { struct usb_device *usbdev = line6->usbdev; const struct line6_properties *properties = line6->properties; int pipe; struct usb_host_endpoint *ep = NULL; if (properties->capabilities & LINE6_CAP_CONTROL) { if (properties->capabilities & LINE6_CAP_CONTROL_MIDI) { pipe = usb_rcvintpipe(line6->usbdev, line6->properties->ep_ctrl_r); } else { pipe = usb_rcvbulkpipe(line6->usbdev, line6->properties->ep_ctrl_r); } ep = usbdev->ep_in[usb_pipeendpoint(pipe)]; } /* Control data transfer properties */ if (ep) { line6->interval = ep->desc.bInterval; line6->max_packet_size = le16_to_cpu(ep->desc.wMaxPacketSize); } else { if (properties->capabilities & LINE6_CAP_CONTROL) { dev_err(line6->ifcdev, "endpoint not available, using fallback values"); } line6->interval = LINE6_FALLBACK_INTERVAL; line6->max_packet_size = LINE6_FALLBACK_MAXPACKETSIZE; } /* Isochronous transfer properties */ if (usbdev->speed == USB_SPEED_LOW) { line6->intervals_per_second = USB_LOW_INTERVALS_PER_SECOND; line6->iso_buffers = USB_LOW_ISO_BUFFERS; } else { line6->intervals_per_second = USB_HIGH_INTERVALS_PER_SECOND; line6->iso_buffers = USB_HIGH_ISO_BUFFERS; } } /* Enable buffering of incoming messages, flush the buffer */ static int line6_hwdep_open(struct snd_hwdep *hw, struct file *file) { struct usb_line6 *line6 = hw->private_data; /* NOTE: hwdep layer provides atomicity here */ line6->messages.active = 1; line6->messages.nonblock = file->f_flags & O_NONBLOCK ? 1 : 0; return 0; } /* Stop buffering */ static int line6_hwdep_release(struct snd_hwdep *hw, struct file *file) { struct usb_line6 *line6 = hw->private_data; line6->messages.active = 0; return 0; } /* Read from circular buffer, return to user */ static long line6_hwdep_read(struct snd_hwdep *hwdep, char __user *buf, long count, loff_t *offset) { struct usb_line6 *line6 = hwdep->private_data; long rv = 0; unsigned int out_count; if (mutex_lock_interruptible(&line6->messages.read_lock)) return -ERESTARTSYS; while (kfifo_len(&line6->messages.fifo) == 0) { mutex_unlock(&line6->messages.read_lock); if (line6->messages.nonblock) return -EAGAIN; rv = wait_event_interruptible( line6->messages.wait_queue, kfifo_len(&line6->messages.fifo) != 0); if (rv < 0) return rv; if (mutex_lock_interruptible(&line6->messages.read_lock)) return -ERESTARTSYS; } if (kfifo_peek_len(&line6->messages.fifo) > count) { /* Buffer too small; allow re-read of the current item... */ rv = -EINVAL; } else { rv = kfifo_to_user(&line6->messages.fifo, buf, count, &out_count); if (rv == 0) rv = out_count; } mutex_unlock(&line6->messages.read_lock); return rv; } /* Write directly (no buffering) to device by user*/ static long line6_hwdep_write(struct snd_hwdep *hwdep, const char __user *data, long count, loff_t *offset) { struct usb_line6 *line6 = hwdep->private_data; int rv; char *data_copy; if (count > line6->max_packet_size * LINE6_RAW_MESSAGES_MAXCOUNT) { /* This is an arbitrary limit - still better than nothing... */ return -EINVAL; } data_copy = memdup_user(data, count); if (IS_ERR(data_copy)) return PTR_ERR(data_copy); rv = line6_send_raw_message(line6, data_copy, count); kfree(data_copy); return rv; } static __poll_t line6_hwdep_poll(struct snd_hwdep *hwdep, struct file *file, poll_table *wait) { __poll_t rv; struct usb_line6 *line6 = hwdep->private_data; poll_wait(file, &line6->messages.wait_queue, wait); mutex_lock(&line6->messages.read_lock); rv = kfifo_len(&line6->messages.fifo) == 0 ? 0 : EPOLLIN | EPOLLRDNORM; mutex_unlock(&line6->messages.read_lock); return rv; } static const struct snd_hwdep_ops hwdep_ops = { .open = line6_hwdep_open, .release = line6_hwdep_release, .read = line6_hwdep_read, .write = line6_hwdep_write, .poll = line6_hwdep_poll, }; /* Insert into circular buffer */ static void line6_hwdep_push_message(struct usb_line6 *line6) { if (!line6->messages.active) return; if (kfifo_avail(&line6->messages.fifo) >= line6->message_length) { /* No race condition here, there's only one writer */ kfifo_in(&line6->messages.fifo, line6->buffer_message, line6->message_length); } /* else TODO: signal overflow */ wake_up_interruptible(&line6->messages.wait_queue); } static int line6_hwdep_init(struct usb_line6 *line6) { int err; struct snd_hwdep *hwdep; /* TODO: usb_driver_claim_interface(); */ line6->process_message = line6_hwdep_push_message; line6->messages.active = 0; init_waitqueue_head(&line6->messages.wait_queue); mutex_init(&line6->messages.read_lock); INIT_KFIFO(line6->messages.fifo); err = snd_hwdep_new(line6->card, "config", 0, &hwdep); if (err < 0) goto end; strscpy(hwdep->name, "config"); hwdep->iface = SNDRV_HWDEP_IFACE_LINE6; hwdep->ops = hwdep_ops; hwdep->private_data = line6; hwdep->exclusive = true; end: return err; } static int line6_init_cap_control(struct usb_line6 *line6) { int ret; /* initialize USB buffers: */ line6->buffer_listen = kzalloc(LINE6_BUFSIZE_LISTEN, GFP_KERNEL); if (!line6->buffer_listen) return -ENOMEM; line6->urb_listen = usb_alloc_urb(0, GFP_KERNEL); if (!line6->urb_listen) return -ENOMEM; if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { line6->buffer_message = kzalloc(LINE6_MIDI_MESSAGE_MAXLEN, GFP_KERNEL); if (!line6->buffer_message) return -ENOMEM; ret = line6_init_midi(line6); if (ret < 0) return ret; } else { ret = line6_hwdep_init(line6); if (ret < 0) return ret; } ret = line6_start_listen(line6); if (ret < 0) { dev_err(line6->ifcdev, "cannot start listening: %d\n", ret); return ret; } return 0; } static void line6_startup_work(struct work_struct *work) { struct usb_line6 *line6 = container_of(work, struct usb_line6, startup_work.work); if (line6->startup) line6->startup(line6); } /* Probe USB device. */ int line6_probe(struct usb_interface *interface, const struct usb_device_id *id, const char *driver_name, const struct line6_properties *properties, int (*private_init)(struct usb_line6 *, const struct usb_device_id *id), size_t data_size) { struct usb_device *usbdev = interface_to_usbdev(interface); struct snd_card *card; struct usb_line6 *line6; int interface_number; int ret; if (WARN_ON(data_size < sizeof(*line6))) return -EINVAL; /* we don't handle multiple configurations */ if (usbdev->descriptor.bNumConfigurations != 1) return -ENODEV; ret = snd_card_new(&interface->dev, SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1, THIS_MODULE, data_size, &card); if (ret < 0) return ret; /* store basic data: */ line6 = card->private_data; line6->card = card; line6->properties = properties; line6->usbdev = usbdev; line6->ifcdev = &interface->dev; INIT_DELAYED_WORK(&line6->startup_work, line6_startup_work); strscpy(card->id, properties->id); strscpy(card->driver, driver_name); strscpy(card->shortname, properties->name); sprintf(card->longname, "Line 6 %s at USB %s", properties->name, dev_name(line6->ifcdev)); card->private_free = line6_destruct; usb_set_intfdata(interface, line6); /* increment reference counters: */ usb_get_dev(usbdev); /* initialize device info: */ dev_info(&interface->dev, "Line 6 %s found\n", properties->name); /* query interface number */ interface_number = interface->cur_altsetting->desc.bInterfaceNumber; /* TODO reserves the bus bandwidth even without actual transfer */ ret = usb_set_interface(usbdev, interface_number, properties->altsetting); if (ret < 0) { dev_err(&interface->dev, "set_interface failed\n"); goto error; } line6_get_usb_properties(line6); if (properties->capabilities & LINE6_CAP_CONTROL) { ret = line6_init_cap_control(line6); if (ret < 0) goto error; } /* initialize device data based on device: */ ret = private_init(line6, id); if (ret < 0) goto error; /* creation of additional special files should go here */ dev_info(&interface->dev, "Line 6 %s now attached\n", properties->name); return 0; error: /* we can call disconnect callback here because no close-sync is * needed yet at this point */ line6_disconnect(interface); return ret; } EXPORT_SYMBOL_GPL(line6_probe); /* Line 6 device disconnected. */ void line6_disconnect(struct usb_interface *interface) { struct usb_line6 *line6 = usb_get_intfdata(interface); struct usb_device *usbdev = interface_to_usbdev(interface); if (!line6) return; if (WARN_ON(usbdev != line6->usbdev)) return; cancel_delayed_work_sync(&line6->startup_work); if (line6->urb_listen != NULL) line6_stop_listen(line6); snd_card_disconnect(line6->card); if (line6->line6pcm) line6_pcm_disconnect(line6->line6pcm); if (line6->disconnect) line6->disconnect(line6); dev_info(&interface->dev, "Line 6 %s now disconnected\n", line6->properties->name); /* make sure the device isn't destructed twice: */ usb_set_intfdata(interface, NULL); snd_card_free_when_closed(line6->card); } EXPORT_SYMBOL_GPL(line6_disconnect); #ifdef CONFIG_PM /* Suspend Line 6 device. */ int line6_suspend(struct usb_interface *interface, pm_message_t message) { struct usb_line6 *line6 = usb_get_intfdata(interface); struct snd_line6_pcm *line6pcm = line6->line6pcm; snd_power_change_state(line6->card, SNDRV_CTL_POWER_D3hot); if (line6->properties->capabilities & LINE6_CAP_CONTROL) line6_stop_listen(line6); if (line6pcm != NULL) line6pcm->flags = 0; return 0; } EXPORT_SYMBOL_GPL(line6_suspend); /* Resume Line 6 device. */ int line6_resume(struct usb_interface *interface) { struct usb_line6 *line6 = usb_get_intfdata(interface); if (line6->properties->capabilities & LINE6_CAP_CONTROL) line6_start_listen(line6); snd_power_change_state(line6->card, SNDRV_CTL_POWER_D0); return 0; } EXPORT_SYMBOL_GPL(line6_resume); #endif /* CONFIG_PM */ MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
32 32 32 2 23 4 1 16 22 22 3 1 18 1 18 19 5231 1049 870 2716 9 7472 7 4616 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_H #define _LINUX_SCHED_H /* * Define 'struct task_struct' and provide the main scheduler * APIs (schedule(), wakeup variants, etc.) */ #include <uapi/linux/sched.h> #include <asm/current.h> #include <asm/processor.h> #include <linux/thread_info.h> #include <linux/preempt.h> #include <linux/cpumask_types.h> #include <linux/cache.h> #include <linux/irqflags_types.h> #include <linux/smp_types.h> #include <linux/pid_types.h> #include <linux/sem_types.h> #include <linux/shm.h> #include <linux/kmsan_types.h> #include <linux/mutex_types.h> #include <linux/plist_types.h> #include <linux/hrtimer_types.h> #include <linux/timer_types.h> #include <linux/seccomp_types.h> #include <linux/nodemask_types.h> #include <linux/refcount_types.h> #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/sched/types.h> #include <linux/signal_types.h> #include <linux/spinlock.h> #include <linux/syscall_user_dispatch_types.h> #include <linux/mm_types_task.h> #include <linux/netdevice_xmit.h> #include <linux/task_io_accounting.h> #include <linux/posix-timers_types.h> #include <linux/restart_block.h> #include <uapi/linux/rseq.h> #include <linux/seqlock_types.h> #include <linux/kcsan.h> #include <linux/rv.h> #include <linux/uidgid_types.h> #include <linux/tracepoint-defs.h> #include <linux/unwind_deferred_types.h> #include <asm/kmap_size.h> /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; struct bio_list; struct blk_plug; struct bpf_local_storage; struct bpf_run_ctx; struct bpf_net_context; struct capture_control; struct cfs_rq; struct fs_struct; struct futex_pi_state; struct io_context; struct io_uring_task; struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; struct perf_ctx_data; struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; struct robust_list_head; struct root_domain; struct rq; struct sched_attr; struct sched_dl_entity; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; struct task_struct; struct user_event_mm; #include <linux/sched/ext.h> /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). * * We have two separate sets of flags: task->__state * is about runnability, while task->exit_state are * about the task exiting. Confusing, but this way * modifying one set can't modify the other one by * mistake. */ /* Used in tsk->__state: */ #define TASK_RUNNING 0x00000000 #define TASK_INTERRUPTIBLE 0x00000001 #define TASK_UNINTERRUPTIBLE 0x00000002 #define __TASK_STOPPED 0x00000004 #define __TASK_TRACED 0x00000008 /* Used in tsk->exit_state: */ #define EXIT_DEAD 0x00000010 #define EXIT_ZOMBIE 0x00000020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->__state again: */ #define TASK_PARKED 0x00000040 #define TASK_DEAD 0x00000080 #define TASK_WAKEKILL 0x00000100 #define TASK_WAKING 0x00000200 #define TASK_NOLOAD 0x00000400 #define TASK_NEW 0x00000800 #define TASK_RTLOCK_WAIT 0x00001000 #define TASK_FREEZABLE 0x00002000 #define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) #define TASK_FROZEN 0x00008000 #define TASK_STATE_MAX 0x00010000 #define TASK_ANY (TASK_STATE_MAX-1) /* * DO NOT ADD ANY NEW USERS ! */ #define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED __TASK_TRACED #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) /* Convenience macros for the sake of wake_up(): */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) #define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) #define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). */ #define is_special_task_state(state) \ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ TASK_DEAD | TASK_FROZEN)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ do { \ WARN_ON_ONCE(is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_special_state_change(state_value) \ do { \ WARN_ON_ONCE(!is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_set_state() \ do { \ current->saved_state_change = current->task_state_change;\ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_restore_state() \ do { \ current->task_state_change = current->saved_state_change;\ } while (0) #else # define debug_normal_state_change(cond) do { } while (0) # define debug_special_state_change(cond) do { } while (0) # define debug_rtlock_wait_set_state() do { } while (0) # define debug_rtlock_wait_restore_state() do { } while (0) #endif #define trace_set_current_state(state_value) \ do { \ if (tracepoint_enabled(sched_set_state_tp)) \ __trace_set_current_state(state_value); \ } while (0) /* * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); * if (CONDITION) * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * * where wake_up_state()/try_to_wake_up() executes a full memory barrier before * accessing p->__state. * * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * * However, with slightly different timing the wakeup TASK_RUNNING store can * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not * a problem either because that will result in one extra go around the loop * and our @cond test will save the day. * * Also see the comments of try_to_wake_up(). */ #define __set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ } while (0) #define set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ trace_set_current_state(state_value); \ smp_store_mb(current->__state, (state_value)); \ } while (0) /* * set_special_state() should be used for those states when the blocking task * can not use the regular condition based wait-loop. In that case we must * serialize against wakeups such that any possible in-flight TASK_RUNNING * stores will not collide with our state change. */ #define set_special_state(state_value) \ do { \ unsigned long flags; /* may shadow */ \ \ raw_spin_lock_irqsave(&current->pi_lock, flags); \ debug_special_state_change((state_value)); \ trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ raw_spin_unlock_irqrestore(&current->pi_lock, flags); \ } while (0) /* * PREEMPT_RT specific variants for "sleeping" spin/rwlocks * * RT's spin/rwlock substitutions are state preserving. The state of the * task when blocking on the lock is saved in task_struct::saved_state and * restored after the lock has been acquired. These operations are * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT * lock related wakeups while the task is blocked on the lock are * redirected to operate on task_struct::saved_state to ensure that these * are not dropped. On restore task_struct::saved_state is set to * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail. * * The lock operation looks like this: * * current_save_and_set_rtlock_wait_state(); * for (;;) { * if (try_lock()) * break; * raw_spin_unlock_irq(&lock->wait_lock); * schedule_rtlock(); * raw_spin_lock_irq(&lock->wait_lock); * set_current_state(TASK_RTLOCK_WAIT); * } * current_restore_rtlock_saved_state(); */ #define current_save_and_set_rtlock_wait_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ current->saved_state = current->__state; \ debug_rtlock_wait_set_state(); \ trace_set_current_state(TASK_RTLOCK_WAIT); \ WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define current_restore_rtlock_saved_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ debug_rtlock_wait_restore_state(); \ trace_set_current_state(current->saved_state); \ WRITE_ONCE(current->__state, current->saved_state); \ current->saved_state = TASK_RUNNING; \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define get_current_state() READ_ONCE(current->__state) /* * Define the task command name length as enum, then it can be visible to * BPF programs. */ enum { TASK_COMM_LEN = 16, }; extern void sched_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern long schedule_timeout(long timeout); extern long schedule_timeout_interruptible(long timeout); extern long schedule_timeout_killable(long timeout); extern long schedule_timeout_uninterruptible(long timeout); extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); asmlinkage void preempt_schedule_irq(void); #ifdef CONFIG_PREEMPT_RT extern void schedule_rtlock(void); #endif extern int __must_check io_schedule_prepare(void); extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); /* wrapper functions to trace from this header file */ DECLARE_TRACEPOINT(sched_set_state_tp); extern void __trace_set_current_state(int state_value); DECLARE_TRACEPOINT(sched_set_need_resched_tp); extern void __trace_set_need_resched(struct task_struct *curr, int tif); /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode * @stime: time spent in system mode * @lock: protects the above two fields * * Stores previous user/system time values such that we can guarantee * monotonicity. */ struct prev_cputime { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE u64 utime; u64 stime; raw_spinlock_t lock; #endif }; enum vtime_state { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, /* Task is idle */ VTIME_IDLE, /* Task runs in kernelspace in a CPU with VTIME active: */ VTIME_SYS, /* Task runs in userspace in a CPU with VTIME active: */ VTIME_USER, /* Task runs as guests in a CPU with VTIME active: */ VTIME_GUEST, }; struct vtime { seqcount_t seqcount; unsigned long long starttime; enum vtime_state state; unsigned int cpu; u64 utime; u64 stime; u64 gtime; }; /* * Utilization clamp constraints. * @UCLAMP_MIN: Minimum utilization * @UCLAMP_MAX: Maximum utilization * @UCLAMP_CNT: Utilization clamp constraints count */ enum uclamp_id { UCLAMP_MIN = 0, UCLAMP_MAX, UCLAMP_CNT }; extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; extern void sched_domains_mutex_lock(void); extern void sched_domains_mutex_unlock(void); struct sched_param { int sched_priority; }; struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ /* # of times we have run on this CPU: */ unsigned long pcount; /* Time spent waiting on a runqueue: */ unsigned long long run_delay; /* Max time spent waiting on a runqueue: */ unsigned long long max_run_delay; /* Min time spent waiting on a runqueue: */ unsigned long long min_run_delay; /* Timestamps: */ /* When did we last run on a CPU? */ unsigned long long last_arrival; /* When were we last queued to run? */ unsigned long long last_queued; #endif /* CONFIG_SCHED_INFO */ }; /* * Integer metrics need fixed point arithmetic, e.g., sched/fair * has a few: load, load_avg, util_avg, freq, and capacity. * * We define a basic fixed point arithmetic range, and then formalize * all these metrics based on that basic range. */ # define SCHED_FIXEDPOINT_SHIFT 10 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) /* Increase resolution of cpu_capacity calculations */ # define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT # define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) struct load_weight { unsigned long weight; u32 inv_weight; }; /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). * * [load_avg definition] * * load_avg = runnable% * scale_load_down(load) * * [runnable_avg definition] * * runnable_avg = runnable% * SCHED_CAPACITY_SCALE * * [util_avg definition] * * util_avg = running% * SCHED_CAPACITY_SCALE * * where runnable% is the time ratio that a sched_entity is runnable and * running% the time ratio that a sched_entity is running. * * For cfs_rq, they are the aggregated values of all runnable and blocked * sched_entities. * * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU * capacity scaling. The scaling is done through the rq_clock_pelt that is used * for computing those signals (see update_rq_clock_pelt()) * * N.B., the above ratios (runnable% and running%) themselves are in the * range of [0, 1]. To do fixed point arithmetics, we therefore scale them * to as large a range as necessary. This is for example reflected by * util_avg's SCHED_CAPACITY_SCALE. * * [Overflow issue] * * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities * with the highest load (=88761), always runnable on a single cfs_rq, * and should not overflow as the number already hits PID_MAX_LIMIT. * * For all other cases (including 32-bit kernels), struct load_weight's * weight will overflow first before we do, because: * * Max(load_avg) <= Max(load.weight) * * Then it is the load_weight's responsibility to consider overflow * issues. */ struct sched_avg { u64 last_update_time; u64 load_sum; u64 runnable_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; unsigned int util_est; } ____cacheline_aligned; /* * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg * updates. When a task is dequeued, its util_est should not be updated if its * util_avg has not been updated in the meantime. * This information is mapped into the MSB bit of util_est at dequeue time. * Since max value of util_est for a task is 1024 (PELT util_avg for a task) * it is safe to use MSB. */ #define UTIL_EST_WEIGHT_SHIFT 2 #define UTIL_AVG_UNCHANGED 0x80000000 struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; u64 wait_count; u64 wait_sum; u64 iowait_count; u64 iowait_sum; u64 sleep_start; u64 sleep_max; s64 sum_sleep_runtime; u64 block_start; u64 block_max; s64 sum_block_runtime; s64 exec_max; u64 slice_max; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; u64 nr_wakeups; u64 nr_wakeups_sync; u64 nr_wakeups_migrate; u64 nr_wakeups_local; u64 nr_wakeups_remote; u64 nr_wakeups_affine; u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; #ifdef CONFIG_SCHED_CORE u64 core_forceidle_sum; #endif #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; struct sched_entity { /* For load-balancing: */ struct load_weight load; struct rb_node run_node; u64 deadline; u64 min_vruntime; u64 min_slice; struct list_head group_node; unsigned char on_rq; unsigned char sched_delayed; unsigned char rel_deadline; unsigned char custom_slice; /* hole */ u64 exec_start; u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; union { /* * When !@on_rq this field is vlag. * When cfs_rq->curr == se (which implies @on_rq) * this field is vprot. See protect_slice(). */ s64 vlag; u64 vprot; }; u64 slice; u64 nr_migrations; #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; /* cached value of my_q->h_nr_running */ unsigned long runnable_weight; #endif /* * Per entity load average tracking. * * Put into separate cache line so it does not * collide with read-mostly values above. */ struct sched_avg avg; }; struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned long watchdog_stamp; unsigned int time_slice; unsigned short on_rq; unsigned short on_list; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ struct rt_rq *rt_rq; /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif } __randomize_layout; typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); struct sched_dl_entity { struct rb_node rb_node; /* * Original scheduling parameters. Copied here from sched_attr * during sched_setattr(), they will remain the same until * the next sched_setattr(). */ u64 dl_runtime; /* Maximum runtime for each instance */ u64 dl_deadline; /* Relative deadline of each instance */ u64 dl_period; /* Separation of two instances (period) */ u64 dl_bw; /* dl_runtime / dl_period */ u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, * they are continuously updated during task execution. Note that * the remaining runtime could be < 0 in case we are in overrun. */ s64 runtime; /* Remaining runtime for this instance */ u64 deadline; /* Absolute deadline for this instance */ unsigned int flags; /* Specifying the scheduler behaviour */ /* * Some bool flags: * * @dl_throttled tells if we exhausted the runtime. If so, the * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * * @dl_non_contending tells if the task is inactive while still * contributing to the active utilization. In other words, it * indicates if the inactive timer has been armed and its handler * has not been executed yet. This flag is useful to avoid race * conditions between the inactive timer handler and the wakeup * code. * * @dl_overrun tells if the task asked to be informed about runtime * overruns. * * @dl_server tells if this is a server entity. * * @dl_defer tells if this is a deferred or regular server. For * now only defer server exists. * * @dl_defer_armed tells if the deferrable server is waiting * for the replenishment timer to activate it. * * @dl_server_active tells if the dlserver is active(started). * dlserver is started on first cfs enqueue on an idle runqueue * and is stopped when a dequeue results in 0 cfs tasks on the * runqueue. In other words, dlserver is active only when cpu's * runqueue has atleast one cfs task. * * @dl_defer_running tells if the deferrable server is actually * running, skipping the defer phase. */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; unsigned int dl_server : 1; unsigned int dl_server_active : 1; unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; unsigned int dl_server_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; /* * Inactive timer, responsible for decreasing the active utilization * at the "0-lag time". When a -deadline task blocks, it contributes * to GRUB's active utilization until the "0-lag time", hence a * timer is needed to decrease the active utilization at the correct * time. */ struct hrtimer inactive_timer; /* * Bits for DL-server functionality. Also see the comment near * dl_server_update(). * * @rq the runqueue this server is for * * @server_has_tasks() returns true if @server_pick return a * runnable task. */ struct rq *rq; dl_server_has_tasks_f server_has_tasks; dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES /* * Priority Inheritance. When a DEADLINE scheduling entity is boosted * pi_se points to the donor, otherwise points to the dl_se it belongs * to (the original one/itself). */ struct sched_dl_entity *pi_se; #endif }; #ifdef CONFIG_UCLAMP_TASK /* Number of utilization clamp buckets (shorter alias) */ #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT /* * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value * @active: the se is currently refcounted in a rq's bucket * @user_defined: the requested clamp value comes from user-space * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from * the fast path. * * The active bit is set whenever a task has got an "effective" value assigned, * which can be different from the clamp value "requested" from user-space. * This allows to know a task is refcounted in the rq's bucket corresponding * to the "effective" bucket_id. * * The user_defined bit is set whenever a task has got a task-specific clamp * value requested from userspace, i.e. the system defaults apply to this task * just as a restriction. This allows to relax default clamps when a less * restrictive task-specific value has been requested, thus allowing to * implement a "nice" semantic. For example, a task running with a 20% * default boost can still drop its own boosting to 0%. */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; unsigned int user_defined : 1; }; #endif /* CONFIG_UCLAMP_TASK */ union rcu_special { struct { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; enum perf_event_task_context { perf_invalid_context = -1, perf_hw_context = 0, perf_sw_context, perf_nr_task_contexts, }; /* * Number of contexts where an event can trigger: * task, softirq, hardirq, nmi. */ #define PERF_NR_CONTEXTS 4 struct wake_q_node { struct wake_q_node *next; }; struct kmap_ctrl { #ifdef CONFIG_KMAP_LOCAL int idx; pte_t pteval[KM_MAX_IDX]; #endif }; struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For reasons of header soup (see current_thread_info()), this * must be the first element of task_struct. */ struct thread_info thread_info; #endif unsigned int __state; /* saved state for "spinlock sleepers" */ unsigned int saved_state; /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. */ randomized_struct_fields_start void *stack; refcount_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; #ifdef CONFIG_MEM_ALLOC_PROFILING struct alloc_tag *alloc_tag; #endif int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; /* * recent_used_cpu is initially set as the last CPU used by a task * that wakes affine another task. Waker/wakee relationships can * push tasks around a CPU where each wakeup moves to the next one. * Tracking a recently used CPU allows a quick search for a recently * used CPU that may be idle. */ int recent_used_cpu; int wake_cpu; int on_rq; int prio; int static_prio; int normal_prio; unsigned int rt_priority; struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; struct sched_dl_entity *dl_server; #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE struct rb_node core_node; unsigned long core_cookie; unsigned int core_occupation; #endif #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp_req[UCLAMP_CNT]; /* * Effective clamp values used for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif struct sched_statistics stats; #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif unsigned int policy; unsigned long max_allowed_capacity; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; unsigned short migration_disabled; unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; u8 rcu_tasks_holdout; u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; int rcu_tasks_exit_cpu; struct list_head rcu_tasks_exit_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; int trc_ipi_to_cpu; union rcu_special trc_reader_special; struct list_head trc_holdout_list; struct list_head trc_blkd_node; int trc_blkd_cpu; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ struct sched_info sched_info; struct list_head tasks; struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; struct mm_struct *mm; struct mm_struct *active_mm; struct address_space *faults_disabled_mapping; int exit_state; int exit_code; int exit_signal; /* The signal sent when the parent dies: */ int pdeath_signal; /* JOBCTL_*, siglock protected: */ unsigned long jobctl; /* Used for emulating ABI behavior of previous Linux versions: */ unsigned int personality; /* Scheduler bits, serialized by scheduler locks: */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; unsigned sched_task_hot:1; /* Force alignment to the next boundary: */ unsigned :0; /* Unserialized, strictly 'current' */ /* * This field must not be in the scheduler word above due to wakelist * queueing no longer being serialized by p->on_cpu. However: * * p->XXX = X; ttwu() * schedule() if (p->on_rq && ..) // false * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true * deactivate_task() ttwu_queue_wakelist()) * p->on_rq = 0; p->sched_remote_wakeup = Y; * * guarantees all stores of 'current' are visible before * ->sched_remote_wakeup gets used, so it can be in this word. */ unsigned sched_remote_wakeup:1; #ifdef CONFIG_RT_MUTEXES unsigned sched_rt_mutex:1; #endif /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG_V1 unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN /* whether the LRU algorithm may apply to this access */ unsigned in_lru_fault:1; #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; /* task is frozen/stopped (used by the cgroup freezer) */ unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP unsigned use_memdelay:1; #endif #ifdef CONFIG_PSI /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif #ifdef CONFIG_PAGE_OWNER /* Used by page_owner=on to detect recursion in page tracking. */ unsigned in_page_owner:1; #endif #ifdef CONFIG_EVENTFD /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd:1; #endif #ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif #ifdef CONFIG_X86_BUS_LOCK_DETECT unsigned reported_split_lock:1; #endif #ifdef CONFIG_TASK_DELAY_ACCT /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif unsigned in_nf_duplicate:1; #ifdef CONFIG_PREEMPT_RT struct netdev_xmit net_xmit; #endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; pid_t pid; pid_t tgid; #ifdef CONFIG_STACKPROTECTOR /* Canary value for the -fstack-protector GCC feature: */ unsigned long stack_canary; #endif /* * Pointers to the (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ /* Real parent process: */ struct task_struct __rcu *real_parent; /* Recipient of SIGCHLD, wait4() reports: */ struct task_struct __rcu *parent; /* * Children/sibling form the list of natural children: */ struct list_head children; struct list_head sibling; struct task_struct *group_leader; /* * 'ptraced' is the list of tasks this task is using ptrace() on. * * This includes both natural children and PTRACE_ATTACH targets. * 'ptrace_entry' is this task's link on the p->parent->ptraced list. */ struct list_head ptraced; struct list_head ptrace_entry; /* PID/PID hash table linkage. */ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_node; struct completion *vfork_done; /* CLONE_CHILD_SETTID: */ int __user *set_child_tid; /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; /* PF_KTHREAD | PF_IO_WORKER */ void *worker_private; u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME u64 utimescaled; u64 stimescaled; #endif u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif /* Context switch counts: */ unsigned long nvcsw; unsigned long nivcsw; /* Monotonic time in nsecs: */ u64 start_time; /* Boot based time in nsecs: */ u64 start_boottime; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; /* Empty if CONFIG_POSIX_CPUTIMERS=n */ struct posix_cputimers posix_cputimers; #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK struct posix_cputimers_work posix_cputimers_work; #endif /* Process credentials: */ /* Tracer's credentials at attach: */ const struct cred __rcu *ptracer_cred; /* Objective and real subjective task credentials (COW): */ const struct cred __rcu *real_cred; /* Effective (overridable) subjective task credentials (COW): */ const struct cred __rcu *cred; #ifdef CONFIG_KEYS /* Cached requested key. */ struct key *cached_requested_key; #endif /* * executable name, excluding path. * * - normally initialized begin_new_exec() * - set it with set_task_comm() * - strscpy_pad() to ensure it is always NUL-terminated and * zero-padded * - task_lock() to ensure the operation is atomic and the name is * fully updated. */ char comm[TASK_COMM_LEN]; struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC struct sysv_sem sysvsem; struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; /* Open file information: */ struct files_struct *files; #ifdef CONFIG_IO_URING struct io_uring_task *io_uring; #endif /* Namespaces: */ struct nsproxy *nsproxy; /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; struct callback_head *task_works; #ifdef CONFIG_AUDIT #ifdef CONFIG_AUDITSYSCALL struct audit_context *audit_context; #endif kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; u64 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ spinlock_t alloc_lock; /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif struct mutex *blocked_on; /* lock we're blocked on */ #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* * Encoded lock address causing task block (lower 2 bits = type from * <linux/hung_task.h>). Accessed via hung_task_*() helpers. */ unsigned long blocker; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events irqtrace; unsigned int hardirq_threaded; u64 hardirq_chain_key; int softirqs_enabled; int softirq_context; int irq_config; #endif #ifdef CONFIG_PREEMPT_RT int softirq_disable_cnt; #endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif #if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif /* Journalling filesystem info: */ void *journal_info; /* Stacked block device info: */ struct bio_list *bio_list; /* Stack plugging: */ struct blk_plug *plug; /* VM state: */ struct reclaim_state *reclaim_state; struct io_context *io_context; #ifdef CONFIG_COMPACTION struct capture_control *capture_control; #endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; #ifdef CONFIG_PSI /* Pressure stall state */ unsigned int psi_flags; #endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; /* Accumulated virtual memory usage: */ u64 acct_vm_mem1; /* stime + utime since last update: */ u64 acct_timexpd; #endif #ifdef CONFIG_CPUSETS /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Sequence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #endif #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; struct mutex futex_exit_mutex; unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS u8 perf_recursion[PERF_NR_CONTEXTS]; struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; struct perf_ctx_data __rcu *perf_ctx_data; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; u8 il_weight; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; unsigned long numa_migrate_retry; /* Migration stamp: */ u64 node_stamp; u64 last_task_numa_placement; u64 last_sum_exec_runtime; struct callback_head numa_work; /* * This pointer is only modified for current in syscall and * pagefault context (and for tasks being destroyed), so it can be read * from any of the following contexts: * - RCU read-side critical section * - current->numa_group from everywhere * - task's runqueue locked, task not running */ struct numa_group __rcu *numa_group; /* * numa_faults is an array split into four regions: * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer * in this precise order. * * faults_memory: Exponential decaying average of faults on a per-node * basis. Scheduling placement decisions are made based on these * counts. The values remain static for the duration of a PTE scan. * faults_cpu: Track the nodes the process was running on when a NUMA * hinting fault was incurred. * faults_memory_buffer and faults_cpu_buffer: Record faults per node * during the current scan window. When the scan completes, the counts * in faults_memory and faults_cpu decay and these values are copied. */ unsigned long *numa_faults; unsigned long total_numa_faults; /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local or failed to migrate. The task scan * period is adapted based on the locality of the faults with different * weights depending on whether they were shared or private faults */ unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_RSEQ struct rseq __user *rseq; u32 rseq_len; u32 rseq_sig; /* * RmW on rseq_event_mask must be performed atomically * with respect to preemption. */ unsigned long rseq_event_mask; # ifdef CONFIG_DEBUG_RSEQ /* * This is a place holder to save a copy of the rseq fields for * validation of read-only fields. The struct rseq has a * variable-length array at the end, so it cannot be used * directly. Reserve a size large enough for the known fields. */ char rseq_fields[sizeof(struct rseq)]; # endif #endif #ifdef CONFIG_SCHED_MM_CID int mm_cid; /* Current cid in mm */ int last_mm_cid; /* Most recent cid in mm */ int migrate_from_cpu; int mm_cid_active; /* Whether cid bitmap is active */ struct callback_head cid_work; #endif struct tlbflush_unmap_batch tlb_ubc; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif #ifdef CONFIG_FAULT_INJECTION int make_it_fail; unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call * balance_dirty_pages() for a dirty throttling pause: */ int nr_dirtied; int nr_dirtied_pause; /* Start of a write-and-pause period: */ unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ u64 timer_slack_ns; u64 default_timer_slack_ns; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; #endif #ifdef CONFIG_KCSAN struct kcsan_ctx kcsan_ctx; #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events kcsan_save_irqtrace; #endif #ifdef CONFIG_KCSAN_WEAK_MEMORY int kcsan_stack_depth; #endif #endif #ifdef CONFIG_KMSAN struct kmsan_ctx kmsan_ctx; #endif #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; int curr_ret_depth; /* Stack of return addresses for return function tracing: */ unsigned long *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; unsigned long long ftrace_sleeptime; /* * Number of functions that haven't been traced * because of depth overrun: */ atomic_t trace_overrun; /* Pause tracing: */ atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV /* See kernel/kcov.c for more details. */ /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; /* Size of the kcov_area: */ unsigned int kcov_size; /* Buffer for coverage collection: */ void *kcov_area; /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; /* KCOV common handle for remote coverage collection: */ u64 kcov_handle; /* KCOV sequence number: */ int kcov_sequence; /* Collect coverage from softirq context: */ unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG_V1 struct mem_cgroup *memcg_in_oom; #endif #ifdef CONFIG_MEMCG /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; /* Cache for current->cgroups->memcg->objcg lookups: */ struct obj_cgroup *objcg; #endif #ifdef CONFIG_BLK_CGROUP struct gendisk *throttle_disk; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) unsigned int sequential_io; unsigned int sequential_io_avg; #endif struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; # ifdef CONFIG_PREEMPT_RT unsigned long saved_state_change; # endif #endif struct rcu_head rcu; refcount_t rcu_users; int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ refcount_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; #endif #ifdef CONFIG_SECURITY /* Used by LSM modules for access restriction: */ void *security; #endif #ifdef CONFIG_BPF_SYSCALL /* Used by BPF task local storage */ struct bpf_local_storage __rcu *bpf_storage; /* Used for BPF run context */ struct bpf_run_ctx *bpf_ctx; #endif /* Used by BPF for per-TASK xdp storage */ struct bpf_net_context *bpf_net_context; #ifdef CONFIG_KSTACK_ERASE unsigned long lowest_stack; #endif #ifdef CONFIG_KSTACK_ERASE_METRICS unsigned long prev_lowest_stack; #endif #ifdef CONFIG_X86_MCE void __user *mce_vaddr; __u64 mce_kflags; u64 mce_addr; __u64 mce_ripv : 1, mce_whole_page : 1, __mce_reserved : 62; struct callback_head mce_kill_me; int mce_count; #endif #ifdef CONFIG_KRETPROBES struct llist_head kretprobe_instances; #endif #ifdef CONFIG_RETHOOK struct llist_head rethooks; #endif #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH /* * If L1D flush is supported on mm context switch * then we use this callback head to queue kill work * to kill tasks that are not running on SMT disabled * cores */ struct callback_head l1d_flush_kill; #endif #ifdef CONFIG_RV /* * Per-task RV monitor, fixed in CONFIG_RV_PER_TASK_MONITORS. * If memory becomes a concern, we can think about a dynamic method. */ union rv_task_monitor rv[CONFIG_RV_PER_TASK_MONITORS]; #endif #ifdef CONFIG_USER_EVENTS struct user_event_mm *user_event_mm; #endif #ifdef CONFIG_UNWIND_USER struct unwind_task_info unwind_info; #endif /* CPU-specific state of this task: */ struct thread_struct thread; /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. */ randomized_struct_fields_end } __attribute__ ((aligned (64))); #ifdef CONFIG_SCHED_PROXY_EXEC DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); static inline bool sched_proxy_exec(void) { return static_branch_likely(&__sched_proxy_exec); } #else static inline bool sched_proxy_exec(void) { return false; } #endif #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) static inline unsigned int __task_state_index(unsigned int tsk_state, unsigned int tsk_exit_state) { unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); if ((tsk_state & TASK_IDLE) == TASK_IDLE) state = TASK_REPORT_IDLE; /* * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. * Report frozen tasks as uninterruptible. */ if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); } static inline unsigned int task_state_index(struct task_struct *tsk) { return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); } static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1)); return state_char[state]; } static inline char task_state_to_char(struct task_struct *tsk) { return task_index_to_char(task_state_index(tsk)); } extern struct pid *cad_pid; /* * Per process flags */ #define PF_VCPU 0x00000001 /* I'm a virtual CPU */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */ #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF__HOLE__00800000 0x00800000 #define PF__HOLE__01000000 0x01000000 #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning. * See memalloc_pin_save() */ #define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example * with tsk_used_math (like during threaded core dumping). * There is however an exception to this rule during ptrace * or during fork: the ptracer task is allowed to write to the * child->flags of its traced child (same goes for fork, the parent * can write to the child->flags), because we're guaranteed the * child is not running and in turn not changing child->flags * at the same time the parent does it. */ #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) #define clear_used_math() clear_stopped_child_used_math(current) #define set_used_math() set_stopped_child_used_math(current) #define conditional_stopped_child_used_math(condition, child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) #define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current) #define copy_to_stopped_child_used_math(child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) static __always_inline bool is_percpu_thread(void) { return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); } /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ #define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ { return test_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_SET(name, func) \ static inline void task_set_##func(struct task_struct *p) \ { set_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_CLEAR(name, func) \ static inline void task_clear_##func(struct task_struct *p) \ { clear_bit(PFA_##name, &p->atomic_flags); } TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) TASK_PFA_TEST(SPREAD_PAGE, spread_page) TASK_PFA_SET(SPREAD_PAGE, spread_page) TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) TASK_PFA_TEST(SPREAD_SLAB, spread_slab) TASK_PFA_SET(SPREAD_SLAB, spread_slab) TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) static inline void current_restore_flags(unsigned long orig_flags, unsigned long flags) { current->flags &= ~flags; current->flags |= orig_flags & flags; } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task * @p: the task * @new_mask: CPU affinity mask * * Return: zero if successful, or a negative error code */ extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); /** * task_nice - return the nice value of a given task. * @p: the task in question. * * Return: The nice value [ -20 ... 0 ... 19 ]. */ static inline int task_nice(const struct task_struct *p) { return PRIO_TO_NICE((p)->static_prio); } extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? * @p: the task in question. * * Return: 1 if @p is an idle task. 0 otherwise. */ static __always_inline bool is_idle_task(const struct task_struct *p) { return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { struct task_struct task; #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; #ifndef CONFIG_THREAD_INFO_IN_TASK extern struct thread_info init_thread_info; #endif extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK # define task_thread_info(task) (&(task)->thread_info) #else # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif /* * find a task by one of its numerical ids * * find_task_by_pid_ns(): * finds a task by its pid in the specified namespace * find_task_by_vpid(): * finds a task by its virtual pid * * see also find_vpid() etc in include/linux/pid.h */ extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); /* * find a task by its virtual pid and get the task struct */ extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); extern void kick_process(struct task_struct *tsk); extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); #define set_task_comm(tsk, from) ({ \ BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ __set_task_comm(tsk, from, false); \ }) /* * - Why not use task_lock()? * User space can randomly change their names anyway, so locking for readers * doesn't make sense. For writers, locking is probably necessary, as a race * condition could lead to long-term mixed results. * The strscpy_pad() in __set_task_comm() can ensure that the task comm is * always NUL-terminated and zero-padded. Therefore the race condition between * reader and writer is not an issue. * * - BUILD_BUG_ON() can help prevent the buf from being truncated. * Since the callers don't perform any return value checks, this safeguard is * necessary. */ #define get_task_comm(buf, tsk) ({ \ BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \ strscpy_pad(buf, (tsk)->comm); \ buf; \ }) static __always_inline void scheduler_ipi(void) { /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. */ preempt_fold_need_resched(); } extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); /* * Set thread flags in other task's structures. * See asm/thread_info.h for TIF_xxxx flags available: */ static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) { set_ti_thread_flag(task_thread_info(tsk), flag); } static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) { clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, bool value) { update_ti_thread_flag(task_thread_info(tsk), flag, value); } static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_ti_thread_flag(task_thread_info(tsk), flag); } static inline void set_tsk_need_resched(struct task_struct *tsk) { if (tracepoint_enabled(sched_set_need_resched_tp) && !test_tsk_thread_flag(tsk, TIF_NEED_RESCHED)) __trace_set_need_resched(tsk, TIF_NEED_RESCHED); set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline void clear_tsk_need_resched(struct task_struct *tsk) { atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, (atomic_long_t *)&task_thread_info(tsk)->flags); } static inline int test_tsk_need_resched(struct task_struct *tsk) { return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) extern int __cond_resched(void); #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) { return static_call_mod(cond_resched)(); } #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) extern int dynamic_cond_resched(void); static __always_inline int _cond_resched(void) { return dynamic_cond_resched(); } #else /* !CONFIG_PREEMPTION */ static inline int _cond_resched(void) { return __cond_resched(); } #endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ #else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ static inline int _cond_resched(void) { return 0; } #endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ #define cond_resched() ({ \ __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) extern int __cond_resched_lock(spinlock_t *lock); extern int __cond_resched_rwlock_read(rwlock_t *lock); extern int __cond_resched_rwlock_write(rwlock_t *lock); #define MIGHT_RESCHED_RCU_SHIFT 8 #define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) #ifndef CONFIG_PREEMPT_RT /* * Non RT kernels have an elevated preempt count due to the held lock, * but are not allowed to be inside a RCU read side critical section */ # define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET #else /* * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in * cond_resched*lock() has to take that into account because it checks for * preempt_count() and rcu_preempt_depth(). */ # define PREEMPT_LOCK_RESCHED_OFFSETS \ (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) #endif #define cond_resched_lock(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_lock(lock); \ }) #define cond_resched_rwlock_read(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_read(lock); \ }) #define cond_resched_rwlock_write(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_write(lock); \ }) #ifndef CONFIG_PREEMPT_RT static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { struct mutex *m = p->blocked_on; if (m) lockdep_assert_held_once(&m->wait_lock); return m; } static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) { struct mutex *blocked_on = READ_ONCE(p->blocked_on); WARN_ON_ONCE(!m); /* The task should only be setting itself as blocked */ WARN_ON_ONCE(p != current); /* Currently we serialize blocked_on under the mutex::wait_lock */ lockdep_assert_held_once(&m->wait_lock); /* * Check ensure we don't overwrite existing mutex value * with a different mutex. Note, setting it to the same * lock repeatedly is ok. */ WARN_ON_ONCE(blocked_on && blocked_on != m); WRITE_ONCE(p->blocked_on, m); } static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) { guard(raw_spinlock_irqsave)(&m->wait_lock); __set_task_blocked_on(p, m); } static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) { if (m) { struct mutex *blocked_on = READ_ONCE(p->blocked_on); /* Currently we serialize blocked_on under the mutex::wait_lock */ lockdep_assert_held_once(&m->wait_lock); /* * There may be cases where we re-clear already cleared * blocked_on relationships, but make sure we are not * clearing the relationship with a different lock. */ WARN_ON_ONCE(blocked_on && blocked_on != m); } WRITE_ONCE(p->blocked_on, NULL); } static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) { guard(raw_spinlock_irqsave)(&m->wait_lock); __clear_task_blocked_on(p, m); } #else static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } #endif /* !CONFIG_PREEMPT_RT */ static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); } /* * Wrappers for p->thread_info->cpu access. No-op on UP. */ #ifdef CONFIG_SMP static inline unsigned int task_cpu(const struct task_struct *p) { return READ_ONCE(task_thread_info(p)->cpu); } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else static inline unsigned int task_cpu(const struct task_struct *p) { return 0; } static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) { } #endif /* CONFIG_SMP */ static inline bool task_is_runnable(struct task_struct *p) { return p->on_rq && !p->se.sched_delayed; } extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. * * This allows us to terminate optimistic spin loops and block, analogous to * the native optimistic spin heuristic of testing if the lock owner task is * running or not. */ #ifndef vcpu_is_preempted static inline bool vcpu_is_preempted(int cpu) { return false; } #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #ifndef TASK_SIZE_OF #define TASK_SIZE_OF(tsk) TASK_SIZE #endif static inline bool owner_on_cpu(struct task_struct *owner) { /* * As lock holder preemption issue, we both skip spinning if * task is not on cpu or its cpu is preempted */ return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner)); } /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, unsigned long uaddr); extern int sched_core_idle_cpu(int cpu); #else static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif extern void sched_set_stop_task(int cpu, struct task_struct *stop); #ifdef CONFIG_MEM_ALLOC_PROFILING static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { swap(current->alloc_tag, tag); return tag; } static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) { #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); #endif current->alloc_tag = old; } #else #define alloc_tag_save(_tag) NULL #define alloc_tag_restore(_tag, _old) do {} while (0) #endif #endif
26657 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_INTERNAL_H #define __X86_KERNEL_FPU_INTERNAL_H extern struct fpstate init_fpstate; /* CPU feature check wrappers */ static __always_inline __pure bool use_xsave(void) { return cpu_feature_enabled(X86_FEATURE_XSAVE); } static __always_inline __pure bool use_fxsr(void) { return cpu_feature_enabled(X86_FEATURE_FXSR); } #ifdef CONFIG_X86_DEBUG_FPU # define WARN_ON_FPU(x) WARN_ON_ONCE(x) #else # define WARN_ON_FPU(x) ({ BUILD_BUG_ON_INVALID(x); 0; }) #endif /* Used in init.c */ extern void fpstate_init_user(struct fpstate *fpstate); extern void fpstate_reset(struct fpu *fpu); #endif
2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 // SPDX-License-Identifier: GPL-2.0-only /* * Frontend driver for the GENPIX 8pks/qpsk/DCII USB2.0 DVB-S module * * Copyright (C) 2006,2007 Alan Nisota (alannisota@gmail.com) * Copyright (C) 2006,2007 Genpix Electronics (genpix@genpix-electronics.com) * * Thanks to GENPIX for the sample code used to implement this module. * * This module is based off the vp7045 and vp702x modules */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "gp8psk-fe.h" #include <media/dvb_frontend.h> static int debug; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "Turn on/off debugging (default:off)."); #define dprintk(fmt, arg...) do { \ if (debug) \ printk(KERN_DEBUG pr_fmt("%s: " fmt), \ __func__, ##arg); \ } while (0) struct gp8psk_fe_state { struct dvb_frontend fe; void *priv; const struct gp8psk_fe_ops *ops; bool is_rev1; u8 lock; u16 snr; unsigned long next_status_check; unsigned long status_check_interval; }; static int gp8psk_tuned_to_DCII(struct dvb_frontend *fe) { struct gp8psk_fe_state *st = fe->demodulator_priv; u8 status; st->ops->in(st->priv, GET_8PSK_CONFIG, 0, 0, &status, 1); return status & bmDCtuned; } static int gp8psk_set_tuner_mode(struct dvb_frontend *fe, int mode) { struct gp8psk_fe_state *st = fe->demodulator_priv; return st->ops->out(st->priv, SET_8PSK_CONFIG, mode, 0, NULL, 0); } static int gp8psk_fe_update_status(struct gp8psk_fe_state *st) { u8 buf[6]; if (time_after(jiffies,st->next_status_check)) { st->ops->in(st->priv, GET_SIGNAL_LOCK, 0, 0, &st->lock, 1); st->ops->in(st->priv, GET_SIGNAL_STRENGTH, 0, 0, buf, 6); st->snr = (buf[1]) << 8 | buf[0]; st->next_status_check = jiffies + (st->status_check_interval*HZ)/1000; } return 0; } static int gp8psk_fe_read_status(struct dvb_frontend *fe, enum fe_status *status) { struct gp8psk_fe_state *st = fe->demodulator_priv; gp8psk_fe_update_status(st); if (st->lock) *status = FE_HAS_LOCK | FE_HAS_SYNC | FE_HAS_VITERBI | FE_HAS_SIGNAL | FE_HAS_CARRIER; else *status = 0; if (*status & FE_HAS_LOCK) st->status_check_interval = 1000; else st->status_check_interval = 100; return 0; } /* not supported by this Frontend */ static int gp8psk_fe_read_ber(struct dvb_frontend* fe, u32 *ber) { (void) fe; *ber = 0; return 0; } /* not supported by this Frontend */ static int gp8psk_fe_read_unc_blocks(struct dvb_frontend* fe, u32 *unc) { (void) fe; *unc = 0; return 0; } static int gp8psk_fe_read_snr(struct dvb_frontend* fe, u16 *snr) { struct gp8psk_fe_state *st = fe->demodulator_priv; gp8psk_fe_update_status(st); /* snr is reported in dBu*256 */ *snr = st->snr; return 0; } static int gp8psk_fe_read_signal_strength(struct dvb_frontend* fe, u16 *strength) { struct gp8psk_fe_state *st = fe->demodulator_priv; gp8psk_fe_update_status(st); /* snr is reported in dBu*256 */ /* snr / 38.4 ~= 100% strength */ /* snr * 17 returns 100% strength as 65535 */ if (st->snr > 0xf00) *strength = 0xffff; else *strength = (st->snr << 4) + st->snr; /* snr*17 */ return 0; } static int gp8psk_fe_get_tune_settings(struct dvb_frontend* fe, struct dvb_frontend_tune_settings *tune) { tune->min_delay_ms = 800; return 0; } static int gp8psk_fe_set_frontend(struct dvb_frontend *fe) { struct gp8psk_fe_state *st = fe->demodulator_priv; struct dtv_frontend_properties *c = &fe->dtv_property_cache; u8 cmd[10]; u32 freq = c->frequency * 1000; dprintk("%s()\n", __func__); cmd[4] = freq & 0xff; cmd[5] = (freq >> 8) & 0xff; cmd[6] = (freq >> 16) & 0xff; cmd[7] = (freq >> 24) & 0xff; /* backwards compatibility: DVB-S + 8-PSK were used for Turbo-FEC */ if (c->delivery_system == SYS_DVBS && c->modulation == PSK_8) c->delivery_system = SYS_TURBO; switch (c->delivery_system) { case SYS_DVBS: if (c->modulation != QPSK) { dprintk("%s: unsupported modulation selected (%d)\n", __func__, c->modulation); return -EOPNOTSUPP; } c->fec_inner = FEC_AUTO; break; case SYS_DVBS2: /* kept for backwards compatibility */ dprintk("%s: DVB-S2 delivery system selected\n", __func__); break; case SYS_TURBO: dprintk("%s: Turbo-FEC delivery system selected\n", __func__); break; default: dprintk("%s: unsupported delivery system selected (%d)\n", __func__, c->delivery_system); return -EOPNOTSUPP; } cmd[0] = c->symbol_rate & 0xff; cmd[1] = (c->symbol_rate >> 8) & 0xff; cmd[2] = (c->symbol_rate >> 16) & 0xff; cmd[3] = (c->symbol_rate >> 24) & 0xff; switch (c->modulation) { case QPSK: if (st->is_rev1) if (gp8psk_tuned_to_DCII(fe)) st->ops->reload(st->priv); switch (c->fec_inner) { case FEC_1_2: cmd[9] = 0; break; case FEC_2_3: cmd[9] = 1; break; case FEC_3_4: cmd[9] = 2; break; case FEC_5_6: cmd[9] = 3; break; case FEC_7_8: cmd[9] = 4; break; case FEC_AUTO: cmd[9] = 5; break; default: cmd[9] = 5; break; } if (c->delivery_system == SYS_TURBO) cmd[8] = ADV_MOD_TURBO_QPSK; else cmd[8] = ADV_MOD_DVB_QPSK; break; case PSK_8: /* PSK_8 is for compatibility with DN */ cmd[8] = ADV_MOD_TURBO_8PSK; switch (c->fec_inner) { case FEC_2_3: cmd[9] = 0; break; case FEC_3_4: cmd[9] = 1; break; case FEC_3_5: cmd[9] = 2; break; case FEC_5_6: cmd[9] = 3; break; case FEC_8_9: cmd[9] = 4; break; default: cmd[9] = 0; break; } break; case QAM_16: /* QAM_16 is for compatibility with DN */ cmd[8] = ADV_MOD_TURBO_16QAM; cmd[9] = 0; break; default: /* Unknown modulation */ dprintk("%s: unsupported modulation selected (%d)\n", __func__, c->modulation); return -EOPNOTSUPP; } if (st->is_rev1) gp8psk_set_tuner_mode(fe, 0); st->ops->out(st->priv, TUNE_8PSK, 0, 0, cmd, 10); st->lock = 0; st->next_status_check = jiffies; st->status_check_interval = 200; return 0; } static int gp8psk_fe_send_diseqc_msg (struct dvb_frontend* fe, struct dvb_diseqc_master_cmd *m) { struct gp8psk_fe_state *st = fe->demodulator_priv; dprintk("%s\n", __func__); if (st->ops->out(st->priv, SEND_DISEQC_COMMAND, m->msg[0], 0, m->msg, m->msg_len)) { return -EINVAL; } return 0; } static int gp8psk_fe_send_diseqc_burst(struct dvb_frontend *fe, enum fe_sec_mini_cmd burst) { struct gp8psk_fe_state *st = fe->demodulator_priv; u8 cmd; dprintk("%s\n", __func__); /* These commands are certainly wrong */ cmd = (burst == SEC_MINI_A) ? 0x00 : 0x01; if (st->ops->out(st->priv, SEND_DISEQC_COMMAND, cmd, 0, &cmd, 0)) { return -EINVAL; } return 0; } static int gp8psk_fe_set_tone(struct dvb_frontend *fe, enum fe_sec_tone_mode tone) { struct gp8psk_fe_state *st = fe->demodulator_priv; if (st->ops->out(st->priv, SET_22KHZ_TONE, (tone == SEC_TONE_ON), 0, NULL, 0)) { return -EINVAL; } return 0; } static int gp8psk_fe_set_voltage(struct dvb_frontend *fe, enum fe_sec_voltage voltage) { struct gp8psk_fe_state *st = fe->demodulator_priv; if (st->ops->out(st->priv, SET_LNB_VOLTAGE, voltage == SEC_VOLTAGE_18, 0, NULL, 0)) { return -EINVAL; } return 0; } static int gp8psk_fe_enable_high_lnb_voltage(struct dvb_frontend* fe, long onoff) { struct gp8psk_fe_state *st = fe->demodulator_priv; return st->ops->out(st->priv, USE_EXTRA_VOLT, onoff, 0, NULL, 0); } static int gp8psk_fe_send_legacy_dish_cmd (struct dvb_frontend* fe, unsigned long sw_cmd) { struct gp8psk_fe_state *st = fe->demodulator_priv; u8 cmd = sw_cmd & 0x7f; if (st->ops->out(st->priv, SET_DN_SWITCH, cmd, 0, NULL, 0)) return -EINVAL; if (st->ops->out(st->priv, SET_LNB_VOLTAGE, !!(sw_cmd & 0x80), 0, NULL, 0)) return -EINVAL; return 0; } static void gp8psk_fe_release(struct dvb_frontend* fe) { struct gp8psk_fe_state *st = fe->demodulator_priv; kfree(st); } static const struct dvb_frontend_ops gp8psk_fe_ops; struct dvb_frontend *gp8psk_fe_attach(const struct gp8psk_fe_ops *ops, void *priv, bool is_rev1) { struct gp8psk_fe_state *st; if (!ops || !ops->in || !ops->out || !ops->reload) { pr_err("Error! gp8psk-fe ops not defined.\n"); return NULL; } st = kzalloc(sizeof(struct gp8psk_fe_state), GFP_KERNEL); if (!st) return NULL; memcpy(&st->fe.ops, &gp8psk_fe_ops, sizeof(struct dvb_frontend_ops)); st->fe.demodulator_priv = st; st->ops = ops; st->priv = priv; st->is_rev1 = is_rev1; pr_info("Frontend %sattached\n", is_rev1 ? "revision 1 " : ""); return &st->fe; } EXPORT_SYMBOL_GPL(gp8psk_fe_attach); static const struct dvb_frontend_ops gp8psk_fe_ops = { .delsys = { SYS_DVBS }, .info = { .name = "Genpix DVB-S", .frequency_min_hz = 800 * MHz, .frequency_max_hz = 2250 * MHz, .frequency_stepsize_hz = 100 * kHz, .symbol_rate_min = 1000000, .symbol_rate_max = 45000000, .symbol_rate_tolerance = 500, /* ppm */ .caps = FE_CAN_INVERSION_AUTO | FE_CAN_FEC_1_2 | FE_CAN_FEC_2_3 | FE_CAN_FEC_3_4 | FE_CAN_FEC_5_6 | FE_CAN_FEC_7_8 | FE_CAN_FEC_AUTO | /* * FE_CAN_QAM_16 is for compatibility * (Myth incorrectly detects Turbo-QPSK as plain QAM-16) */ FE_CAN_QPSK | FE_CAN_QAM_16 | FE_CAN_TURBO_FEC }, .release = gp8psk_fe_release, .init = NULL, .sleep = NULL, .set_frontend = gp8psk_fe_set_frontend, .get_tune_settings = gp8psk_fe_get_tune_settings, .read_status = gp8psk_fe_read_status, .read_ber = gp8psk_fe_read_ber, .read_signal_strength = gp8psk_fe_read_signal_strength, .read_snr = gp8psk_fe_read_snr, .read_ucblocks = gp8psk_fe_read_unc_blocks, .diseqc_send_master_cmd = gp8psk_fe_send_diseqc_msg, .diseqc_send_burst = gp8psk_fe_send_diseqc_burst, .set_tone = gp8psk_fe_set_tone, .set_voltage = gp8psk_fe_set_voltage, .dishnetwork_send_legacy_command = gp8psk_fe_send_legacy_dish_cmd, .enable_high_lnb_voltage = gp8psk_fe_enable_high_lnb_voltage }; MODULE_AUTHOR("Alan Nisota <alannisota@gamil.com>"); MODULE_DESCRIPTION("Frontend Driver for Genpix DVB-S"); MODULE_VERSION("1.1"); MODULE_LICENSE("GPL");
37 46 46 46 46 147 153 153 153 153 153 37 37 37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 // SPDX-License-Identifier: GPL-2.0-or-later /* * A generic kernel FIFO implementation * * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> */ #include <linux/dma-mapping.h> #include <linux/err.h> #include <linux/export.h> #include <linux/kfifo.h> #include <linux/log2.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/uaccess.h> /* * internal helper to calculate the unused elements in a fifo */ static inline unsigned int kfifo_unused(struct __kfifo *fifo) { return (fifo->mask + 1) - (fifo->in - fifo->out); } int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, size_t esize, gfp_t gfp_mask) { /* * round up to the next power of 2, since our 'let the indices * wrap' technique works only in this case. */ size = roundup_pow_of_two(size); fifo->in = 0; fifo->out = 0; fifo->esize = esize; if (size < 2) { fifo->data = NULL; fifo->mask = 0; return -EINVAL; } fifo->data = kmalloc_array(esize, size, gfp_mask); if (!fifo->data) { fifo->mask = 0; return -ENOMEM; } fifo->mask = size - 1; return 0; } EXPORT_SYMBOL(__kfifo_alloc); void __kfifo_free(struct __kfifo *fifo) { kfree(fifo->data); fifo->in = 0; fifo->out = 0; fifo->esize = 0; fifo->data = NULL; fifo->mask = 0; } EXPORT_SYMBOL(__kfifo_free); int __kfifo_init(struct __kfifo *fifo, void *buffer, unsigned int size, size_t esize) { size /= esize; if (!is_power_of_2(size)) size = rounddown_pow_of_two(size); fifo->in = 0; fifo->out = 0; fifo->esize = esize; fifo->data = buffer; if (size < 2) { fifo->mask = 0; return -EINVAL; } fifo->mask = size - 1; return 0; } EXPORT_SYMBOL(__kfifo_init); static void kfifo_copy_in(struct __kfifo *fifo, const void *src, unsigned int len, unsigned int off) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); memcpy(fifo->data + off, src, l); memcpy(fifo->data, src + l, len - l); /* * make sure that the data in the fifo is up to date before * incrementing the fifo->in index counter */ smp_wmb(); } unsigned int __kfifo_in(struct __kfifo *fifo, const void *buf, unsigned int len) { unsigned int l; l = kfifo_unused(fifo); if (len > l) len = l; kfifo_copy_in(fifo, buf, len, fifo->in); fifo->in += len; return len; } EXPORT_SYMBOL(__kfifo_in); static void kfifo_copy_out(struct __kfifo *fifo, void *dst, unsigned int len, unsigned int off) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); memcpy(dst, fifo->data + off, l); memcpy(dst + l, fifo->data, len - l); /* * make sure that the data is copied before * incrementing the fifo->out index counter */ smp_wmb(); } unsigned int __kfifo_out_peek(struct __kfifo *fifo, void *buf, unsigned int len) { unsigned int l; l = fifo->in - fifo->out; if (len > l) len = l; kfifo_copy_out(fifo, buf, len, fifo->out); return len; } EXPORT_SYMBOL(__kfifo_out_peek); unsigned int __kfifo_out_linear(struct __kfifo *fifo, unsigned int *tail, unsigned int n) { unsigned int size = fifo->mask + 1; unsigned int off = fifo->out & fifo->mask; if (tail) *tail = off; return min3(n, fifo->in - fifo->out, size - off); } EXPORT_SYMBOL(__kfifo_out_linear); unsigned int __kfifo_out(struct __kfifo *fifo, void *buf, unsigned int len) { len = __kfifo_out_peek(fifo, buf, len); fifo->out += len; return len; } EXPORT_SYMBOL(__kfifo_out); static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, const void __user *from, unsigned int len, unsigned int off, unsigned int *copied) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; unsigned long ret; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); ret = copy_from_user(fifo->data + off, from, l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret + len - l, esize); else { ret = copy_from_user(fifo->data, from + l, len - l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret, esize); } /* * make sure that the data in the fifo is up to date before * incrementing the fifo->in index counter */ smp_wmb(); *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, unsigned long len, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int esize = fifo->esize; int err; if (esize != 1) len /= esize; l = kfifo_unused(fifo); if (len > l) len = l; ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); if (unlikely(ret)) { len -= ret; err = -EFAULT; } else err = 0; fifo->in += len; return err; } EXPORT_SYMBOL(__kfifo_from_user); static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, unsigned int len, unsigned int off, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); ret = copy_to_user(to, fifo->data + off, l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret + len - l, esize); else { ret = copy_to_user(to + l, fifo->data, len - l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret, esize); } /* * make sure that the data is copied before * incrementing the fifo->out index counter */ smp_wmb(); *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } int __kfifo_to_user(struct __kfifo *fifo, void __user *to, unsigned long len, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int esize = fifo->esize; int err; if (esize != 1) len /= esize; l = fifo->in - fifo->out; if (len > l) len = l; ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); if (unlikely(ret)) { len -= ret; err = -EFAULT; } else err = 0; fifo->out += len; return err; } EXPORT_SYMBOL(__kfifo_to_user); static unsigned int setup_sgl_buf(struct __kfifo *fifo, struct scatterlist *sgl, unsigned int data_offset, int nents, unsigned int len, dma_addr_t dma) { const void *buf = fifo->data + data_offset; if (!nents || !len) return 0; sg_set_buf(sgl, buf, len); if (dma != DMA_MAPPING_ERROR) { sg_dma_address(sgl) = dma + data_offset; sg_dma_len(sgl) = len; } return 1; } static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, unsigned int off, dma_addr_t dma) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int len_to_end; unsigned int n; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } len_to_end = min(len, size - off); n = setup_sgl_buf(fifo, sgl, off, nents, len_to_end, dma); n += setup_sgl_buf(fifo, sgl + n, 0, nents - n, len - len_to_end, dma); return n; } unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, dma_addr_t dma) { unsigned int l; l = kfifo_unused(fifo); if (len > l) len = l; return setup_sgl(fifo, sgl, nents, len, fifo->in, dma); } EXPORT_SYMBOL(__kfifo_dma_in_prepare); unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, dma_addr_t dma) { unsigned int l; l = fifo->in - fifo->out; if (len > l) len = l; return setup_sgl(fifo, sgl, nents, len, fifo->out, dma); } EXPORT_SYMBOL(__kfifo_dma_out_prepare); unsigned int __kfifo_max_r(unsigned int len, size_t recsize) { unsigned int max = (1 << (recsize << 3)) - 1; if (len > max) return max; return len; } EXPORT_SYMBOL(__kfifo_max_r); #define __KFIFO_PEEK(data, out, mask) \ ((data)[(out) & (mask)]) /* * __kfifo_peek_n internal helper function for determinate the length of * the next record in the fifo */ static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) { unsigned int l; unsigned int mask = fifo->mask; unsigned char *data = fifo->data; l = __KFIFO_PEEK(data, fifo->out, mask); if (--recsize) l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; return l; } #define __KFIFO_POKE(data, in, mask, val) \ ( \ (data)[(in) & (mask)] = (unsigned char)(val) \ ) /* * __kfifo_poke_n internal helper function for storing the length of * the record into the fifo */ static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) { unsigned int mask = fifo->mask; unsigned char *data = fifo->data; __KFIFO_POKE(data, fifo->in, mask, n); if (recsize > 1) __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); } unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) { return __kfifo_peek_n(fifo, recsize); } EXPORT_SYMBOL(__kfifo_len_r); unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, unsigned int len, size_t recsize) { if (len + recsize > kfifo_unused(fifo)) return 0; __kfifo_poke_n(fifo, len, recsize); kfifo_copy_in(fifo, buf, len, fifo->in + recsize); fifo->in += len + recsize; return len; } EXPORT_SYMBOL(__kfifo_in_r); static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize, unsigned int *n) { *n = __kfifo_peek_n(fifo, recsize); if (len > *n) len = *n; kfifo_copy_out(fifo, buf, len, fifo->out + recsize); return len; } unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize) { unsigned int n; if (fifo->in == fifo->out) return 0; return kfifo_out_copy_r(fifo, buf, len, recsize, &n); } EXPORT_SYMBOL(__kfifo_out_peek_r); unsigned int __kfifo_out_linear_r(struct __kfifo *fifo, unsigned int *tail, unsigned int n, size_t recsize) { if (fifo->in == fifo->out) return 0; if (tail) *tail = fifo->out + recsize; return min(n, __kfifo_peek_n(fifo, recsize)); } EXPORT_SYMBOL(__kfifo_out_linear_r); unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize) { unsigned int n; if (fifo->in == fifo->out) return 0; len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); fifo->out += n + recsize; return len; } EXPORT_SYMBOL(__kfifo_out_r); void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) { unsigned int n; n = __kfifo_peek_n(fifo, recsize); fifo->out += n + recsize; } EXPORT_SYMBOL(__kfifo_skip_r); int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, unsigned long len, unsigned int *copied, size_t recsize) { unsigned long ret; len = __kfifo_max_r(len, recsize); if (len + recsize > kfifo_unused(fifo)) { *copied = 0; return 0; } __kfifo_poke_n(fifo, len, recsize); ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); if (unlikely(ret)) { *copied = 0; return -EFAULT; } fifo->in += len + recsize; return 0; } EXPORT_SYMBOL(__kfifo_from_user_r); int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, unsigned long len, unsigned int *copied, size_t recsize) { unsigned long ret; unsigned int n; if (fifo->in == fifo->out) { *copied = 0; return 0; } n = __kfifo_peek_n(fifo, recsize); if (len > n) len = n; ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); if (unlikely(ret)) { *copied = 0; return -EFAULT; } fifo->out += n + recsize; return 0; } EXPORT_SYMBOL(__kfifo_to_user_r); unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize, dma_addr_t dma) { BUG_ON(!nents); len = __kfifo_max_r(len, recsize); if (len + recsize > kfifo_unused(fifo)) return 0; return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize, dma); } EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); void __kfifo_dma_in_finish_r(struct __kfifo *fifo, unsigned int len, size_t recsize) { len = __kfifo_max_r(len, recsize); __kfifo_poke_n(fifo, len, recsize); fifo->in += len + recsize; } EXPORT_SYMBOL(__kfifo_dma_in_finish_r); unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize, dma_addr_t dma) { BUG_ON(!nents); len = __kfifo_max_r(len, recsize); if (len + recsize > fifo->in - fifo->out) return 0; return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize, dma); } EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
15 15 15 15 15 15 15 15 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 /* * llc_if.c - Defines LLC interface to upper layer * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/gfp.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/errno.h> #include <net/llc_if.h> #include <net/llc_sap.h> #include <net/llc_s_ev.h> #include <net/llc_conn.h> #include <net/sock.h> #include <net/llc_c_ev.h> #include <net/llc_c_ac.h> #include <net/llc_c_st.h> #include <net/tcp_states.h> /** * llc_build_and_send_pkt - Connection data sending for upper layers. * @sk: connection * @skb: packet to send * * This function is called when upper layer wants to send data using * connection oriented communication mode. During sending data, connection * will be locked and received frames and expired timers will be queued. * Returns 0 for success, -ECONNABORTED when the connection already * closed and -EBUSY when sending data is not permitted in this state or * LLC has send an I pdu with p bit set to 1 and is waiting for it's * response. * * This function always consumes a reference to the skb. */ int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev; int rc = -ECONNABORTED; struct llc_sock *llc = llc_sk(sk); if (unlikely(llc->state == LLC_CONN_STATE_ADM)) goto out_free; rc = -EBUSY; if (unlikely(llc_data_accept_state(llc->state) || /* data_conn_refuse */ llc->p_flag)) { llc->failed_data_req = 1; goto out_free; } ev = llc_conn_ev(skb); ev->type = LLC_CONN_EV_TYPE_PRIM; ev->prim = LLC_DATA_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; skb->dev = llc->dev; return llc_conn_state_process(sk, skb); out_free: kfree_skb(skb); return rc; } /** * llc_establish_connection - Called by upper layer to establish a conn * @sk: connection * @lmac: local mac address * @dmac: destination mac address * @dsap: destination sap * * Upper layer calls this to establish an LLC connection with a remote * machine. This function packages a proper event and sends it connection * component state machine. Success or failure of connection * establishment will inform to upper layer via calling it's confirm * function and passing proper information. */ int llc_establish_connection(struct sock *sk, const u8 *lmac, u8 *dmac, u8 dsap) { int rc = -EISCONN; struct llc_addr laddr, daddr; struct sk_buff *skb; struct llc_sock *llc = llc_sk(sk); struct sock *existing; laddr.lsap = llc->sap->laddr.lsap; daddr.lsap = dsap; memcpy(daddr.mac, dmac, sizeof(daddr.mac)); memcpy(laddr.mac, lmac, sizeof(laddr.mac)); existing = llc_lookup_established(llc->sap, &daddr, &laddr, sock_net(sk)); if (existing) { if (existing->sk_state == TCP_ESTABLISHED) { sk = existing; goto out_put; } else sock_put(existing); } sock_hold(sk); rc = -ENOMEM; skb = alloc_skb(0, GFP_ATOMIC); if (skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->type = LLC_CONN_EV_TYPE_PRIM; ev->prim = LLC_CONN_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; skb_set_owner_w(skb, sk); rc = llc_conn_state_process(sk, skb); } out_put: sock_put(sk); return rc; } /** * llc_send_disc - Called by upper layer to close a connection * @sk: connection to be closed * * Upper layer calls this when it wants to close an established LLC * connection with a remote machine. This function packages a proper event * and sends it to connection component state machine. Returns 0 for * success, 1 otherwise. */ int llc_send_disc(struct sock *sk) { u16 rc = 1; struct llc_conn_state_ev *ev; struct sk_buff *skb; sock_hold(sk); if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_ESTABLISHED || llc_sk(sk)->state == LLC_CONN_STATE_ADM || llc_sk(sk)->state == LLC_CONN_OUT_OF_SVC) goto out; /* * Postpone unassigning the connection from its SAP and returning the * connection until all ACTIONs have been completely executed */ skb = alloc_skb(0, GFP_ATOMIC); if (!skb) goto out; skb_set_owner_w(skb, sk); sk->sk_state = TCP_CLOSING; ev = llc_conn_ev(skb); ev->type = LLC_CONN_EV_TYPE_PRIM; ev->prim = LLC_DISC_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; rc = llc_conn_state_process(sk, skb); out: sock_put(sk); return rc; }
9 2 1 2 1 1 2 1 2 1 1 1 1 14 14 2 14 14 14 7 14 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 // SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * CLC (connection layer control) handshake over initial TCP socket to * prepare for RDMA traffic * * Copyright IBM Corp. 2016, 2018 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/in.h> #include <linux/inetdevice.h> #include <linux/if_ether.h> #include <linux/sched/signal.h> #include <linux/utsname.h> #include <linux/ctype.h> #include <net/addrconf.h> #include <net/sock.h> #include <net/tcp.h> #include "smc.h" #include "smc_core.h" #include "smc_clc.h" #include "smc_ib.h" #include "smc_ism.h" #include "smc_netlink.h" #define SMCR_CLC_ACCEPT_CONFIRM_LEN 68 #define SMCD_CLC_ACCEPT_CONFIRM_LEN 48 #define SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 78 #define SMCR_CLC_ACCEPT_CONFIRM_LEN_V2 108 #define SMC_CLC_RECV_BUF_LEN 100 /* eye catcher "SMCR" EBCDIC for CLC messages */ static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; /* eye catcher "SMCD" EBCDIC for CLC messages */ static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'}; static u8 smc_hostname[SMC_MAX_HOSTNAME_LEN]; struct smc_clc_eid_table { rwlock_t lock; struct list_head list; u8 ueid_cnt; u8 seid_enabled; }; static struct smc_clc_eid_table smc_clc_eid_table; struct smc_clc_eid_entry { struct list_head list; u8 eid[SMC_MAX_EID_LEN]; }; /* The size of a user EID is 32 characters. * Valid characters should be (single-byte character set) A-Z, 0-9, '.' and '-'. * Blanks should only be used to pad to the expected size. * First character must be alphanumeric. */ static bool smc_clc_ueid_valid(char *ueid) { char *end = ueid + SMC_MAX_EID_LEN; while (--end >= ueid && isspace(*end)) ; if (end < ueid) return false; if (!isalnum(*ueid) || islower(*ueid)) return false; while (ueid <= end) { if ((!isalnum(*ueid) || islower(*ueid)) && *ueid != '.' && *ueid != '-') return false; ueid++; } return true; } static int smc_clc_ueid_add(char *ueid) { struct smc_clc_eid_entry *new_ueid, *tmp_ueid; int rc; if (!smc_clc_ueid_valid(ueid)) return -EINVAL; /* add a new ueid entry to the ueid table if there isn't one */ new_ueid = kzalloc(sizeof(*new_ueid), GFP_KERNEL); if (!new_ueid) return -ENOMEM; memcpy(new_ueid->eid, ueid, SMC_MAX_EID_LEN); write_lock(&smc_clc_eid_table.lock); if (smc_clc_eid_table.ueid_cnt >= SMC_MAX_UEID) { rc = -ERANGE; goto err_out; } list_for_each_entry(tmp_ueid, &smc_clc_eid_table.list, list) { if (!memcmp(tmp_ueid->eid, ueid, SMC_MAX_EID_LEN)) { rc = -EEXIST; goto err_out; } } list_add_tail(&new_ueid->list, &smc_clc_eid_table.list); smc_clc_eid_table.ueid_cnt++; write_unlock(&smc_clc_eid_table.lock); return 0; err_out: write_unlock(&smc_clc_eid_table.lock); kfree(new_ueid); return rc; } int smc_clc_ueid_count(void) { int count; read_lock(&smc_clc_eid_table.lock); count = smc_clc_eid_table.ueid_cnt; read_unlock(&smc_clc_eid_table.lock); return count; } int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info) { struct nlattr *nla_ueid = info->attrs[SMC_NLA_EID_TABLE_ENTRY]; char *ueid; if (!nla_ueid || nla_len(nla_ueid) != SMC_MAX_EID_LEN + 1) return -EINVAL; ueid = (char *)nla_data(nla_ueid); return smc_clc_ueid_add(ueid); } /* remove one or all ueid entries from the table */ static int smc_clc_ueid_remove(char *ueid) { struct smc_clc_eid_entry *lst_ueid, *tmp_ueid; int rc = -ENOENT; /* remove table entry */ write_lock(&smc_clc_eid_table.lock); list_for_each_entry_safe(lst_ueid, tmp_ueid, &smc_clc_eid_table.list, list) { if (!ueid || !memcmp(lst_ueid->eid, ueid, SMC_MAX_EID_LEN)) { list_del(&lst_ueid->list); smc_clc_eid_table.ueid_cnt--; kfree(lst_ueid); rc = 0; } } #if IS_ENABLED(CONFIG_S390) if (!rc && !smc_clc_eid_table.ueid_cnt) { smc_clc_eid_table.seid_enabled = 1; rc = -EAGAIN; /* indicate success and enabling of seid */ } #endif write_unlock(&smc_clc_eid_table.lock); return rc; } int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info) { struct nlattr *nla_ueid = info->attrs[SMC_NLA_EID_TABLE_ENTRY]; char *ueid; if (!nla_ueid || nla_len(nla_ueid) != SMC_MAX_EID_LEN + 1) return -EINVAL; ueid = (char *)nla_data(nla_ueid); return smc_clc_ueid_remove(ueid); } int smc_nl_flush_ueid(struct sk_buff *skb, struct genl_info *info) { smc_clc_ueid_remove(NULL); return 0; } static int smc_nl_ueid_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, u32 flags, char *ueid) { char ueid_str[SMC_MAX_EID_LEN + 1]; void *hdr; hdr = genlmsg_put(skb, portid, seq, &smc_gen_nl_family, flags, SMC_NETLINK_DUMP_UEID); if (!hdr) return -ENOMEM; memcpy(ueid_str, ueid, SMC_MAX_EID_LEN); ueid_str[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_EID_TABLE_ENTRY, ueid_str)) { genlmsg_cancel(skb, hdr); return -EMSGSIZE; } genlmsg_end(skb, hdr); return 0; } static int _smc_nl_ueid_dump(struct sk_buff *skb, u32 portid, u32 seq, int start_idx) { struct smc_clc_eid_entry *lst_ueid; int idx = 0; read_lock(&smc_clc_eid_table.lock); list_for_each_entry(lst_ueid, &smc_clc_eid_table.list, list) { if (idx++ < start_idx) continue; if (smc_nl_ueid_dumpinfo(skb, portid, seq, NLM_F_MULTI, lst_ueid->eid)) { --idx; break; } } read_unlock(&smc_clc_eid_table.lock); return idx; } int smc_nl_dump_ueid(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); int idx; idx = _smc_nl_ueid_dump(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb_ctx->pos[0]); cb_ctx->pos[0] = idx; return skb->len; } int smc_nl_dump_seid(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); char seid_str[SMC_MAX_EID_LEN + 1]; u8 seid_enabled; void *hdr; u8 *seid; if (cb_ctx->pos[0]) return skb->len; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_DUMP_SEID); if (!hdr) return -ENOMEM; if (!smc_ism_is_v2_capable()) goto end; smc_ism_get_system_eid(&seid); memcpy(seid_str, seid, SMC_MAX_EID_LEN); seid_str[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_SEID_ENTRY, seid_str)) goto err; read_lock(&smc_clc_eid_table.lock); seid_enabled = smc_clc_eid_table.seid_enabled; read_unlock(&smc_clc_eid_table.lock); if (nla_put_u8(skb, SMC_NLA_SEID_ENABLED, seid_enabled)) goto err; end: genlmsg_end(skb, hdr); cb_ctx->pos[0]++; return skb->len; err: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } int smc_nl_enable_seid(struct sk_buff *skb, struct genl_info *info) { #if IS_ENABLED(CONFIG_S390) write_lock(&smc_clc_eid_table.lock); smc_clc_eid_table.seid_enabled = 1; write_unlock(&smc_clc_eid_table.lock); return 0; #else return -EOPNOTSUPP; #endif } int smc_nl_disable_seid(struct sk_buff *skb, struct genl_info *info) { int rc = 0; #if IS_ENABLED(CONFIG_S390) write_lock(&smc_clc_eid_table.lock); if (!smc_clc_eid_table.ueid_cnt) rc = -ENOENT; else smc_clc_eid_table.seid_enabled = 0; write_unlock(&smc_clc_eid_table.lock); #else rc = -EOPNOTSUPP; #endif return rc; } static bool _smc_clc_match_ueid(u8 *peer_ueid) { struct smc_clc_eid_entry *tmp_ueid; list_for_each_entry(tmp_ueid, &smc_clc_eid_table.list, list) { if (!memcmp(tmp_ueid->eid, peer_ueid, SMC_MAX_EID_LEN)) return true; } return false; } bool smc_clc_match_eid(u8 *negotiated_eid, struct smc_clc_v2_extension *smc_v2_ext, u8 *peer_eid, u8 *local_eid) { bool match = false; int i; negotiated_eid[0] = 0; read_lock(&smc_clc_eid_table.lock); if (peer_eid && local_eid && smc_clc_eid_table.seid_enabled && smc_v2_ext->hdr.flag.seid && !memcmp(peer_eid, local_eid, SMC_MAX_EID_LEN)) { memcpy(negotiated_eid, peer_eid, SMC_MAX_EID_LEN); match = true; goto out; } for (i = 0; i < smc_v2_ext->hdr.eid_cnt; i++) { if (_smc_clc_match_ueid(smc_v2_ext->user_eids[i])) { memcpy(negotiated_eid, smc_v2_ext->user_eids[i], SMC_MAX_EID_LEN); match = true; goto out; } } out: read_unlock(&smc_clc_eid_table.lock); return match; } /* check arriving CLC proposal */ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) { struct smc_clc_msg_proposal_prefix *pclc_prfx; struct smc_clc_smcd_v2_extension *smcd_v2_ext; struct smc_clc_msg_hdr *hdr = &pclc->hdr; struct smc_clc_v2_extension *v2_ext; pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (!pclc_prfx || pclc_prfx->ipv6_prefixes_cnt > SMC_CLC_MAX_V6_PREFIX) return false; if (hdr->version == SMC_V1) { if (hdr->typev1 == SMC_TYPE_N) return false; if (ntohs(hdr->length) != sizeof(*pclc) + ntohs(pclc->iparea_offset) + sizeof(*pclc_prfx) + pclc_prfx->ipv6_prefixes_cnt * sizeof(struct smc_clc_ipv6_prefix) + sizeof(struct smc_clc_msg_trail)) return false; } else { v2_ext = smc_get_clc_v2_ext(pclc); if ((hdr->typev2 != SMC_TYPE_N && (!v2_ext || v2_ext->hdr.eid_cnt > SMC_CLC_MAX_UEID)) || (smcd_indicated(hdr->typev2) && v2_ext->hdr.ism_gid_cnt > SMCD_CLC_MAX_V2_GID_ENTRIES)) return false; if (ntohs(hdr->length) != sizeof(*pclc) + sizeof(struct smc_clc_msg_smcd) + (hdr->typev1 != SMC_TYPE_N ? sizeof(*pclc_prfx) + pclc_prfx->ipv6_prefixes_cnt * sizeof(struct smc_clc_ipv6_prefix) : 0) + (hdr->typev2 != SMC_TYPE_N ? sizeof(*v2_ext) + v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN : 0) + (smcd_indicated(hdr->typev2) ? sizeof(*smcd_v2_ext) + v2_ext->hdr.ism_gid_cnt * sizeof(struct smc_clc_smcd_gid_chid) : 0) + sizeof(struct smc_clc_msg_trail)) return false; } return true; } /* check arriving CLC accept or confirm */ static bool smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm *clc) { struct smc_clc_msg_hdr *hdr = &clc->hdr; if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D) return false; if (hdr->version == SMC_V1) { if ((hdr->typev1 == SMC_TYPE_R && ntohs(hdr->length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) || (hdr->typev1 == SMC_TYPE_D && ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN)) return false; } else { if (hdr->typev1 == SMC_TYPE_D && ntohs(hdr->length) < SMCD_CLC_ACCEPT_CONFIRM_LEN_V2) return false; if (hdr->typev1 == SMC_TYPE_R && ntohs(hdr->length) < SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) return false; } return true; } /* check arriving CLC decline */ static bool smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc) { struct smc_clc_msg_hdr *hdr = &dclc->hdr; if (hdr->version == SMC_V1) { if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline)) return false; } else { if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline_v2)) return false; } return true; } static int smc_clc_fill_fce_v2x(struct smc_clc_first_contact_ext_v2x *fce_v2x, struct smc_init_info *ini) { int ret = sizeof(*fce_v2x); memset(fce_v2x, 0, sizeof(*fce_v2x)); fce_v2x->fce_v2_base.os_type = SMC_CLC_OS_LINUX; fce_v2x->fce_v2_base.release = ini->release_nr; memcpy(fce_v2x->fce_v2_base.hostname, smc_hostname, sizeof(smc_hostname)); if (ini->is_smcd && ini->release_nr < SMC_RELEASE_1) { ret = sizeof(struct smc_clc_first_contact_ext); goto out; } if (ini->release_nr >= SMC_RELEASE_1) { if (!ini->is_smcd) { fce_v2x->max_conns = ini->max_conns; fce_v2x->max_links = ini->max_links; } fce_v2x->feature_mask = htons(ini->feature_mask); } out: return ret; } /* check if received message has a correct header length and contains valid * heading and trailing eyecatchers */ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl) { struct smc_clc_msg_accept_confirm *clc; struct smc_clc_msg_proposal *pclc; struct smc_clc_msg_decline *dclc; struct smc_clc_msg_trail *trl; if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && memcmp(clcm->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER))) return false; switch (clcm->type) { case SMC_CLC_PROPOSAL: pclc = (struct smc_clc_msg_proposal *)clcm; if (!smc_clc_msg_prop_valid(pclc)) return false; trl = (struct smc_clc_msg_trail *) ((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl)); break; case SMC_CLC_ACCEPT: case SMC_CLC_CONFIRM: clc = (struct smc_clc_msg_accept_confirm *)clcm; if (!smc_clc_msg_acc_conf_valid(clc)) return false; trl = (struct smc_clc_msg_trail *) ((u8 *)clc + ntohs(clc->hdr.length) - sizeof(*trl)); break; case SMC_CLC_DECLINE: dclc = (struct smc_clc_msg_decline *)clcm; if (!smc_clc_msg_decl_valid(dclc)) return false; check_trl = false; break; default: return false; } if (check_trl && memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && memcmp(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER))) return false; return true; } /* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, struct smc_clc_msg_proposal_prefix *prop) { struct in_device *in_dev = __in_dev_get_rcu(dst->dev); const struct in_ifaddr *ifa; if (!in_dev) return -ENODEV; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (!inet_ifa_match(ipv4, ifa)) continue; prop->prefix_len = inet_mask_len(ifa->ifa_mask); prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask; /* prop->ipv6_prefixes_cnt = 0; already done by memset before */ return 0; } return -ENOENT; } /* fill CLC proposal msg with ipv6 prefixes from device */ static int smc_clc_prfx_set6_rcu(struct dst_entry *dst, struct smc_clc_msg_proposal_prefix *prop, struct smc_clc_ipv6_prefix *ipv6_prfx) { #if IS_ENABLED(CONFIG_IPV6) struct inet6_dev *in6_dev = __in6_dev_get(dst->dev); struct inet6_ifaddr *ifa; int cnt = 0; if (!in6_dev) return -ENODEV; /* use a maximum of 8 IPv6 prefixes from device */ list_for_each_entry(ifa, &in6_dev->addr_list, if_list) { if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL) continue; ipv6_addr_prefix(&ipv6_prfx[cnt].prefix, &ifa->addr, ifa->prefix_len); ipv6_prfx[cnt].prefix_len = ifa->prefix_len; cnt++; if (cnt == SMC_CLC_MAX_V6_PREFIX) break; } prop->ipv6_prefixes_cnt = cnt; if (cnt) return 0; #endif return -ENOENT; } /* retrieve and set prefixes in CLC proposal msg */ static int smc_clc_prfx_set(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop, struct smc_clc_ipv6_prefix *ipv6_prfx) { struct dst_entry *dst = sk_dst_get(clcsock->sk); struct sockaddr_storage addrs; struct sockaddr_in6 *addr6; struct sockaddr_in *addr; int rc = -ENOENT; if (!dst) { rc = -ENOTCONN; goto out; } if (!dst->dev) { rc = -ENODEV; goto out_rel; } /* get address to which the internal TCP socket is bound */ if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0) goto out_rel; /* analyze IP specific data of net_device belonging to TCP socket */ addr6 = (struct sockaddr_in6 *)&addrs; rcu_read_lock(); if (addrs.ss_family == PF_INET) { /* IPv4 */ addr = (struct sockaddr_in *)&addrs; rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop); } else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) { /* mapped IPv4 address - peer is IPv4 only */ rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3], prop); } else { /* IPv6 */ rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx); } rcu_read_unlock(); out_rel: dst_release(dst); out: return rc; } /* match ipv4 addrs of dev against addr in CLC proposal */ static int smc_clc_prfx_match4_rcu(struct net_device *dev, struct smc_clc_msg_proposal_prefix *prop) { struct in_device *in_dev = __in_dev_get_rcu(dev); const struct in_ifaddr *ifa; if (!in_dev) return -ENODEV; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) && inet_ifa_match(prop->outgoing_subnet, ifa)) return 0; } return -ENOENT; } /* match ipv6 addrs of dev against addrs in CLC proposal */ static int smc_clc_prfx_match6_rcu(struct net_device *dev, struct smc_clc_msg_proposal_prefix *prop) { #if IS_ENABLED(CONFIG_IPV6) struct inet6_dev *in6_dev = __in6_dev_get(dev); struct smc_clc_ipv6_prefix *ipv6_prfx; struct inet6_ifaddr *ifa; int i, max; if (!in6_dev) return -ENODEV; /* ipv6 prefix list starts behind smc_clc_msg_proposal_prefix */ ipv6_prfx = (struct smc_clc_ipv6_prefix *)((u8 *)prop + sizeof(*prop)); max = min_t(u8, prop->ipv6_prefixes_cnt, SMC_CLC_MAX_V6_PREFIX); list_for_each_entry(ifa, &in6_dev->addr_list, if_list) { if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL) continue; for (i = 0; i < max; i++) { if (ifa->prefix_len == ipv6_prfx[i].prefix_len && ipv6_prefix_equal(&ifa->addr, &ipv6_prfx[i].prefix, ifa->prefix_len)) return 0; } } #endif return -ENOENT; } /* check if proposed prefixes match one of our device prefixes */ int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop) { struct dst_entry *dst = sk_dst_get(clcsock->sk); int rc; if (!dst) { rc = -ENOTCONN; goto out; } if (!dst->dev) { rc = -ENODEV; goto out_rel; } rcu_read_lock(); if (!prop->ipv6_prefixes_cnt) rc = smc_clc_prfx_match4_rcu(dst->dev, prop); else rc = smc_clc_prfx_match6_rcu(dst->dev, prop); rcu_read_unlock(); out_rel: dst_release(dst); out: return rc; } /* Wait for data on the tcp-socket, analyze received data * Returns: * 0 if success and it was not a decline that we received. * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send. * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise. */ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type, unsigned long timeout) { long rcvtimeo = READ_ONCE(smc->clcsock->sk->sk_rcvtimeo); struct sock *clc_sk = smc->clcsock->sk; struct smc_clc_msg_hdr *clcm = buf; struct msghdr msg = {NULL, 0}; int reason_code = 0; struct kvec vec = {buf, buflen}; int len, datlen, recvlen; bool check_trl = true; int krflags; /* peek the first few bytes to determine length of data to receive * so we don't consume any subsequent CLC message or payload data * in the TCP byte stream */ /* * Caller must make sure that buflen is no less than * sizeof(struct smc_clc_msg_hdr) */ krflags = MSG_PEEK | MSG_WAITALL; WRITE_ONCE(clc_sk->sk_rcvtimeo, timeout); iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, sizeof(struct smc_clc_msg_hdr)); len = sock_recvmsg(smc->clcsock, &msg, krflags); if (signal_pending(current)) { reason_code = -EINTR; clc_sk->sk_err = EINTR; smc->sk.sk_err = EINTR; goto out; } if (clc_sk->sk_err) { reason_code = -clc_sk->sk_err; if (clc_sk->sk_err == EAGAIN && expected_type == SMC_CLC_DECLINE) clc_sk->sk_err = 0; /* reset for fallback usage */ else smc->sk.sk_err = clc_sk->sk_err; goto out; } if (!len) { /* peer has performed orderly shutdown */ smc->sk.sk_err = ECONNRESET; reason_code = -ECONNRESET; goto out; } if (len < 0) { if (len != -EAGAIN || expected_type != SMC_CLC_DECLINE) smc->sk.sk_err = -len; reason_code = len; goto out; } datlen = ntohs(clcm->length); if ((len < sizeof(struct smc_clc_msg_hdr)) || (clcm->version < SMC_V1) || ((clcm->type != SMC_CLC_DECLINE) && (clcm->type != expected_type))) { smc->sk.sk_err = EPROTO; reason_code = -EPROTO; goto out; } /* receive the complete CLC message */ memset(&msg, 0, sizeof(struct msghdr)); if (datlen > buflen) { check_trl = false; recvlen = buflen; } else { recvlen = datlen; } iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, recvlen); krflags = MSG_WAITALL; len = sock_recvmsg(smc->clcsock, &msg, krflags); if (len < recvlen || !smc_clc_msg_hdr_valid(clcm, check_trl)) { smc->sk.sk_err = EPROTO; reason_code = -EPROTO; goto out; } datlen -= len; while (datlen) { u8 tmp[SMC_CLC_RECV_BUF_LEN]; vec.iov_base = &tmp; vec.iov_len = SMC_CLC_RECV_BUF_LEN; /* receive remaining proposal message */ recvlen = datlen > SMC_CLC_RECV_BUF_LEN ? SMC_CLC_RECV_BUF_LEN : datlen; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, recvlen); len = sock_recvmsg(smc->clcsock, &msg, krflags); if (len < recvlen) { smc->sk.sk_err = EPROTO; reason_code = -EPROTO; goto out; } datlen -= len; } if (clcm->type == SMC_CLC_DECLINE) { struct smc_clc_msg_decline *dclc; dclc = (struct smc_clc_msg_decline *)clcm; reason_code = SMC_CLC_DECL_PEERDECL; smc->peer_diagnosis = ntohl(dclc->peer_diagnosis); if (((struct smc_clc_msg_decline *)buf)->hdr.typev2 & SMC_FIRST_CONTACT_MASK) { smc->conn.lgr->sync_err = 1; smc_lgr_terminate_sched(smc->conn.lgr); } } out: WRITE_ONCE(clc_sk->sk_rcvtimeo, rcvtimeo); return reason_code; } /* send CLC DECLINE message across internal TCP socket */ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) { struct smc_clc_msg_decline *dclc_v1; struct smc_clc_msg_decline_v2 dclc; struct msghdr msg; int len, send_len; struct kvec vec; dclc_v1 = (struct smc_clc_msg_decline *)&dclc; memset(&dclc, 0, sizeof(dclc)); memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); dclc.hdr.type = SMC_CLC_DECLINE; dclc.hdr.version = version; dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX; dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? SMC_FIRST_CONTACT_MASK : 0; if ((!smc_conn_lgr_valid(&smc->conn) || !smc->conn.lgr->is_smcd) && smc_ib_is_valid_local_systemid()) memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); dclc.peer_diagnosis = htonl(peer_diag_info); if (version == SMC_V1) { memcpy(dclc_v1->trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); send_len = sizeof(*dclc_v1); } else { memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); send_len = sizeof(dclc); } dclc.hdr.length = htons(send_len); memset(&msg, 0, sizeof(msg)); vec.iov_base = &dclc; vec.iov_len = send_len; len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len); if (len < 0 || len < send_len) len = -EPROTO; return len > 0 ? 0 : len; } /* send CLC PROPOSAL message across internal TCP socket */ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_clc_smcd_v2_extension *smcd_v2_ext; struct smc_clc_msg_proposal_prefix *pclc_prfx; struct smc_clc_msg_proposal *pclc_base; struct smc_clc_smcd_gid_chid *gidchids; struct smc_clc_msg_proposal_area *pclc; struct smc_clc_ipv6_prefix *ipv6_prfx; struct net *net = sock_net(&smc->sk); struct smc_clc_v2_extension *v2_ext; struct smc_clc_msg_smcd *pclc_smcd; struct smc_clc_msg_trail *trl; struct smcd_dev *smcd; int len, i, plen, rc; int reason_code = 0; struct kvec vec[8]; struct msghdr msg; pclc = kzalloc(sizeof(*pclc), GFP_KERNEL); if (!pclc) return -ENOMEM; pclc_base = &pclc->pclc_base; pclc_smcd = &pclc->pclc_smcd; pclc_prfx = &pclc->pclc_prfx; ipv6_prfx = pclc->pclc_prfx_ipv6; v2_ext = container_of(&pclc->pclc_v2_ext, struct smc_clc_v2_extension, fixed); smcd_v2_ext = container_of(&pclc->pclc_smcd_v2_ext, struct smc_clc_smcd_v2_extension, fixed); gidchids = pclc->pclc_gidchids; trl = &pclc->pclc_trl; pclc_base->hdr.version = SMC_V2; pclc_base->hdr.typev1 = ini->smc_type_v1; pclc_base->hdr.typev2 = ini->smc_type_v2; plen = sizeof(*pclc_base) + sizeof(*pclc_smcd) + sizeof(*trl); /* retrieve ip prefixes for CLC proposal msg */ if (ini->smc_type_v1 != SMC_TYPE_N) { rc = smc_clc_prfx_set(smc->clcsock, pclc_prfx, ipv6_prfx); if (rc) { if (ini->smc_type_v2 == SMC_TYPE_N) { kfree(pclc); return SMC_CLC_DECL_CNFERR; } pclc_base->hdr.typev1 = SMC_TYPE_N; } else { pclc_base->iparea_offset = htons(sizeof(*pclc_smcd)); plen += sizeof(*pclc_prfx) + pclc_prfx->ipv6_prefixes_cnt * sizeof(ipv6_prfx[0]); } } /* build SMC Proposal CLC message */ memcpy(pclc_base->hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); pclc_base->hdr.type = SMC_CLC_PROPOSAL; if (smcr_indicated(ini->smc_type_v1)) { /* add SMC-R specifics */ memcpy(pclc_base->lcl.id_for_peer, local_systemid, sizeof(local_systemid)); memcpy(pclc_base->lcl.gid, ini->ib_gid, SMC_GID_SIZE); memcpy(pclc_base->lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1], ETH_ALEN); } if (smcd_indicated(ini->smc_type_v1)) { struct smcd_gid smcd_gid; /* add SMC-D specifics */ if (ini->ism_dev[0]) { smcd = ini->ism_dev[0]; smcd->ops->get_local_gid(smcd, &smcd_gid); pclc_smcd->ism.gid = htonll(smcd_gid.gid); pclc_smcd->ism.chid = htons(smc_ism_get_chid(ini->ism_dev[0])); } } if (ini->smc_type_v2 == SMC_TYPE_N) { pclc_smcd->v2_ext_offset = 0; } else { struct smc_clc_eid_entry *ueident; u16 v2_ext_offset; v2_ext->hdr.flag.release = SMC_RELEASE; v2_ext_offset = sizeof(*pclc_smcd) - offsetofend(struct smc_clc_msg_smcd, v2_ext_offset); if (ini->smc_type_v1 != SMC_TYPE_N) v2_ext_offset += sizeof(*pclc_prfx) + pclc_prfx->ipv6_prefixes_cnt * sizeof(ipv6_prfx[0]); pclc_smcd->v2_ext_offset = htons(v2_ext_offset); plen += sizeof(*v2_ext); v2_ext->feature_mask = htons(SMC_FEATURE_MASK); read_lock(&smc_clc_eid_table.lock); v2_ext->hdr.eid_cnt = smc_clc_eid_table.ueid_cnt; plen += smc_clc_eid_table.ueid_cnt * SMC_MAX_EID_LEN; i = 0; list_for_each_entry(ueident, &smc_clc_eid_table.list, list) { memcpy(v2_ext->user_eids[i++], ueident->eid, sizeof(ueident->eid)); } read_unlock(&smc_clc_eid_table.lock); } if (smcd_indicated(ini->smc_type_v2)) { struct smcd_gid smcd_gid; u8 *eid = NULL; int entry = 0; v2_ext->hdr.flag.seid = smc_clc_eid_table.seid_enabled; v2_ext->hdr.smcd_v2_ext_offset = htons(sizeof(*v2_ext) - offsetofend(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset) + v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN); smc_ism_get_system_eid(&eid); if (eid && v2_ext->hdr.flag.seid) memcpy(smcd_v2_ext->system_eid, eid, SMC_MAX_EID_LEN); plen += sizeof(*smcd_v2_ext); if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { smcd = ini->ism_dev[i]; smcd->ops->get_local_gid(smcd, &smcd_gid); gidchids[entry].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); gidchids[entry].gid = htonll(smcd_gid.gid); if (smc_ism_is_emulated(smcd)) { /* an Emulated-ISM device takes two * entries. CHID of the second entry * repeats that of the first entry. */ gidchids[entry + 1].chid = gidchids[entry].chid; gidchids[entry + 1].gid = htonll(smcd_gid.gid_ext); entry++; } entry++; } plen += entry * sizeof(struct smc_clc_smcd_gid_chid); } v2_ext->hdr.ism_gid_cnt = entry; } if (smcr_indicated(ini->smc_type_v2)) { memcpy(v2_ext->roce, ini->smcrv2.ib_gid_v2, SMC_GID_SIZE); v2_ext->max_conns = net->smc.sysctl_max_conns_per_lgr; v2_ext->max_links = net->smc.sysctl_max_links_per_lgr; } pclc_base->hdr.length = htons(plen); memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); /* send SMC Proposal CLC message */ memset(&msg, 0, sizeof(msg)); i = 0; vec[i].iov_base = pclc_base; vec[i++].iov_len = sizeof(*pclc_base); vec[i].iov_base = pclc_smcd; vec[i++].iov_len = sizeof(*pclc_smcd); if (ini->smc_type_v1 != SMC_TYPE_N) { vec[i].iov_base = pclc_prfx; vec[i++].iov_len = sizeof(*pclc_prfx); if (pclc_prfx->ipv6_prefixes_cnt > 0) { vec[i].iov_base = ipv6_prfx; vec[i++].iov_len = pclc_prfx->ipv6_prefixes_cnt * sizeof(ipv6_prfx[0]); } } if (ini->smc_type_v2 != SMC_TYPE_N) { vec[i].iov_base = v2_ext; vec[i++].iov_len = sizeof(*v2_ext) + (v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN); if (smcd_indicated(ini->smc_type_v2)) { vec[i].iov_base = smcd_v2_ext; vec[i++].iov_len = sizeof(*smcd_v2_ext); if (ini->ism_offered_cnt) { vec[i].iov_base = gidchids; vec[i++].iov_len = v2_ext->hdr.ism_gid_cnt * sizeof(struct smc_clc_smcd_gid_chid); } } } vec[i].iov_base = trl; vec[i++].iov_len = sizeof(*trl); /* due to the few bytes needed for clc-handshake this cannot block */ len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen); if (len < 0) { smc->sk.sk_err = smc->clcsock->sk->sk_err; reason_code = -smc->sk.sk_err; } else if (len < ntohs(pclc_base->hdr.length)) { reason_code = -ENETUNREACH; smc->sk.sk_err = -reason_code; } kfree(pclc); return reason_code; } static void smcd_clc_prep_confirm_accept(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc, int first_contact, u8 version, u8 *eid, struct smc_init_info *ini, int *fce_len, struct smc_clc_first_contact_ext_v2x *fce_v2x, struct smc_clc_msg_trail *trl) { struct smcd_dev *smcd = conn->lgr->smcd; struct smcd_gid smcd_gid; u16 chid; int len; /* SMC-D specific settings */ memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); smcd->ops->get_local_gid(smcd, &smcd_gid); clc->hdr.typev1 = SMC_TYPE_D; clc->d0.gid = htonll(smcd_gid.gid); clc->d0.token = htonll(conn->rmb_desc->token); clc->d0.dmbe_size = conn->rmbe_size_comp; clc->d0.dmbe_idx = 0; memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); if (version == SMC_V1) { clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); } else { chid = smc_ism_get_chid(smcd); clc->d1.chid = htons(chid); if (eid && eid[0]) memcpy(clc->d1.eid, eid, SMC_MAX_EID_LEN); if (__smc_ism_is_emulated(chid)) clc->d1.gid_ext = htonll(smcd_gid.gid_ext); len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; if (first_contact) { *fce_len = smc_clc_fill_fce_v2x(fce_v2x, ini); len += *fce_len; } clc->hdr.length = htons(len); } memcpy(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); } static void smcr_clc_prep_confirm_accept(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc, int first_contact, u8 version, u8 *eid, struct smc_init_info *ini, int *fce_len, struct smc_clc_first_contact_ext_v2x *fce_v2x, struct smc_clc_fce_gid_ext *gle, struct smc_clc_msg_trail *trl) { struct smc_link *link = conn->lnk; int len; /* SMC-R specific settings */ memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); clc->hdr.typev1 = SMC_TYPE_R; memcpy(clc->r0.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE); memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); hton24(clc->r0.qpn, link->roce_qp->qp_num); clc->r0.rmb_rkey = htonl(conn->rmb_desc->mr[link->link_idx]->rkey); clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ clc->r0.rmbe_alert_token = htonl(conn->alert_token_local); switch (clc->hdr.type) { case SMC_CLC_ACCEPT: clc->r0.qp_mtu = link->path_mtu; break; case SMC_CLC_CONFIRM: clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); break; } clc->r0.rmbe_size = conn->rmbe_size_comp; clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : cpu_to_be64((u64)sg_dma_address (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(clc->r0.psn, link->psn_initial); if (version == SMC_V1) { clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); } else { if (eid && eid[0]) memcpy(clc->r1.eid, eid, SMC_MAX_EID_LEN); len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2; if (first_contact) { *fce_len = smc_clc_fill_fce_v2x(fce_v2x, ini); len += *fce_len; fce_v2x->fce_v2_base.v2_direct = !link->lgr->uses_gateway; if (clc->hdr.type == SMC_CLC_CONFIRM) { memset(gle, 0, sizeof(*gle)); gle->gid_cnt = ini->smcrv2.gidlist.len; len += sizeof(*gle); len += gle->gid_cnt * sizeof(gle->gid[0]); } } clc->hdr.length = htons(len); } memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); } /* build and send CLC CONFIRM / ACCEPT message */ static int smc_clc_send_confirm_accept(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc, int first_contact, u8 version, u8 *eid, struct smc_init_info *ini) { struct smc_clc_first_contact_ext_v2x fce_v2x; struct smc_connection *conn = &smc->conn; struct smc_clc_fce_gid_ext gle; struct smc_clc_msg_trail trl; int i, fce_len; struct kvec vec[5]; struct msghdr msg; /* send SMC Confirm CLC msg */ clc->hdr.version = version; /* SMC version */ if (first_contact) clc->hdr.typev2 |= SMC_FIRST_CONTACT_MASK; if (conn->lgr->is_smcd) smcd_clc_prep_confirm_accept(conn, clc, first_contact, version, eid, ini, &fce_len, &fce_v2x, &trl); else smcr_clc_prep_confirm_accept(conn, clc, first_contact, version, eid, ini, &fce_len, &fce_v2x, &gle, &trl); memset(&msg, 0, sizeof(msg)); i = 0; vec[i].iov_base = clc; if (version > SMC_V1) vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ? SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 : SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) - sizeof(trl); else vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ? SMCD_CLC_ACCEPT_CONFIRM_LEN : SMCR_CLC_ACCEPT_CONFIRM_LEN) - sizeof(trl); if (version > SMC_V1 && first_contact) { vec[i].iov_base = &fce_v2x; vec[i++].iov_len = fce_len; if (!conn->lgr->is_smcd) { if (clc->hdr.type == SMC_CLC_CONFIRM) { vec[i].iov_base = &gle; vec[i++].iov_len = sizeof(gle); vec[i].iov_base = &ini->smcrv2.gidlist.list; vec[i++].iov_len = gle.gid_cnt * sizeof(gle.gid[0]); } } } vec[i].iov_base = &trl; vec[i++].iov_len = sizeof(trl); return kernel_sendmsg(smc->clcsock, &msg, vec, 1, ntohs(clc->hdr.length)); } /* send CLC CONFIRM message across internal TCP socket */ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, u8 version, u8 *eid, struct smc_init_info *ini) { struct smc_clc_msg_accept_confirm cclc; int reason_code = 0; int len; /* send SMC Confirm CLC msg */ memset(&cclc, 0, sizeof(cclc)); cclc.hdr.type = SMC_CLC_CONFIRM; len = smc_clc_send_confirm_accept(smc, &cclc, clnt_first_contact, version, eid, ini); if (len < ntohs(cclc.hdr.length)) { if (len >= 0) { reason_code = -ENETUNREACH; smc->sk.sk_err = -reason_code; } else { smc->sk.sk_err = smc->clcsock->sk->sk_err; reason_code = -smc->sk.sk_err; } } return reason_code; } /* send CLC ACCEPT message across internal TCP socket */ int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, u8 version, u8 *negotiated_eid, struct smc_init_info *ini) { struct smc_clc_msg_accept_confirm aclc; int len; memset(&aclc, 0, sizeof(aclc)); aclc.hdr.type = SMC_CLC_ACCEPT; len = smc_clc_send_confirm_accept(new_smc, &aclc, srv_first_contact, version, negotiated_eid, ini); if (len < ntohs(aclc.hdr.length)) len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err; return len > 0 ? 0 : len; } int smc_clc_srv_v2x_features_validate(struct smc_sock *smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { struct smc_clc_v2_extension *pclc_v2_ext; struct net *net = sock_net(&smc->sk); ini->max_conns = SMC_CONN_PER_LGR_MAX; ini->max_links = SMC_LINKS_ADD_LNK_MAX; ini->feature_mask = SMC_FEATURE_MASK; if ((!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) || ini->release_nr < SMC_RELEASE_1) return 0; pclc_v2_ext = smc_get_clc_v2_ext(pclc); if (!pclc_v2_ext) return SMC_CLC_DECL_NOV2EXT; if (ini->smcr_version & SMC_V2) { ini->max_conns = min_t(u8, pclc_v2_ext->max_conns, net->smc.sysctl_max_conns_per_lgr); if (ini->max_conns < SMC_CONN_PER_LGR_MIN) return SMC_CLC_DECL_MAXCONNERR; ini->max_links = min_t(u8, pclc_v2_ext->max_links, net->smc.sysctl_max_links_per_lgr); if (ini->max_links < SMC_LINKS_ADD_LNK_MIN) return SMC_CLC_DECL_MAXLINKERR; } return 0; } int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, struct smc_init_info *ini) { struct smc_clc_first_contact_ext_v2x *fce_v2x = (struct smc_clc_first_contact_ext_v2x *)fce; if (ini->release_nr < SMC_RELEASE_1) return 0; if (!ini->is_smcd) { if (fce_v2x->max_conns < SMC_CONN_PER_LGR_MIN) return SMC_CLC_DECL_MAXCONNERR; ini->max_conns = fce_v2x->max_conns; if (fce_v2x->max_links > SMC_LINKS_ADD_LNK_MAX || fce_v2x->max_links < SMC_LINKS_ADD_LNK_MIN) return SMC_CLC_DECL_MAXLINKERR; ini->max_links = fce_v2x->max_links; } /* common supplemental features of server and client */ ini->feature_mask = ntohs(fce_v2x->feature_mask) & SMC_FEATURE_MASK; return 0; } int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, struct smc_init_info *ini) { struct smc_clc_first_contact_ext *fce = smc_get_clc_first_contact_ext(cclc, ini->is_smcd); struct smc_clc_first_contact_ext_v2x *fce_v2x = (struct smc_clc_first_contact_ext_v2x *)fce; if (cclc->hdr.version == SMC_V1 || !(cclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) return 0; if (ini->release_nr != fce->release) return SMC_CLC_DECL_RELEASEERR; if (fce->release < SMC_RELEASE_1) return 0; if (!ini->is_smcd) { if (fce_v2x->max_conns != ini->max_conns) return SMC_CLC_DECL_MAXCONNERR; if (fce_v2x->max_links != ini->max_links) return SMC_CLC_DECL_MAXLINKERR; } /* common supplemental features returned by client */ ini->feature_mask = ntohs(fce_v2x->feature_mask); return 0; } void smc_clc_get_hostname(u8 **host) { *host = &smc_hostname[0]; } void __init smc_clc_init(void) { struct new_utsname *u; memset(smc_hostname, _S, sizeof(smc_hostname)); /* ASCII blanks */ u = utsname(); memcpy(smc_hostname, u->nodename, min_t(size_t, strlen(u->nodename), sizeof(smc_hostname))); INIT_LIST_HEAD(&smc_clc_eid_table.list); rwlock_init(&smc_clc_eid_table.lock); smc_clc_eid_table.ueid_cnt = 0; #if IS_ENABLED(CONFIG_S390) smc_clc_eid_table.seid_enabled = 1; #else smc_clc_eid_table.seid_enabled = 0; #endif } void smc_clc_exit(void) { smc_clc_ueid_remove(NULL); }
17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM oom #if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_OOM_H #include <linux/tracepoint.h> #include <trace/events/mmflags.h> #define PG_COUNT_TO_KB(x) ((x) << (PAGE_SHIFT - 10)) TRACE_EVENT(oom_score_adj_update, TP_PROTO(struct task_struct *task), TP_ARGS(task), TP_STRUCT__entry( __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN ) __field( short, oom_score_adj) ), TP_fast_assign( __entry->pid = task->pid; memcpy(__entry->comm, task->comm, TASK_COMM_LEN); __entry->oom_score_adj = task->signal->oom_score_adj; ), TP_printk("pid=%d comm=%s oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->oom_score_adj) ); TRACE_EVENT(reclaim_retry_zone, TP_PROTO(struct zoneref *zoneref, int order, unsigned long reclaimable, unsigned long available, unsigned long min_wmark, int no_progress_loops, bool wmark_check), TP_ARGS(zoneref, order, reclaimable, available, min_wmark, no_progress_loops, wmark_check), TP_STRUCT__entry( __field( int, node) __field( int, zone_idx) __field( int, order) __field( unsigned long, reclaimable) __field( unsigned long, available) __field( unsigned long, min_wmark) __field( int, no_progress_loops) __field( bool, wmark_check) ), TP_fast_assign( __entry->node = zonelist_node_idx(zoneref); __entry->zone_idx = zonelist_zone_idx(zoneref); __entry->order = order; __entry->reclaimable = reclaimable; __entry->available = available; __entry->min_wmark = min_wmark; __entry->no_progress_loops = no_progress_loops; __entry->wmark_check = wmark_check; ), TP_printk("node=%d zone=%-8s order=%d reclaimable=%lu available=%lu min_wmark=%lu no_progress_loops=%d wmark_check=%d", __entry->node, __print_symbolic(__entry->zone_idx, ZONE_TYPE), __entry->order, __entry->reclaimable, __entry->available, __entry->min_wmark, __entry->no_progress_loops, __entry->wmark_check) ); TRACE_EVENT(mark_victim, TP_PROTO(struct task_struct *task, uid_t uid), TP_ARGS(task, uid), TP_STRUCT__entry( __field(int, pid) __string(comm, task->comm) __field(unsigned long, total_vm) __field(unsigned long, anon_rss) __field(unsigned long, file_rss) __field(unsigned long, shmem_rss) __field(uid_t, uid) __field(unsigned long, pgtables) __field(short, oom_score_adj) ), TP_fast_assign( __entry->pid = task->pid; __assign_str(comm); __entry->total_vm = PG_COUNT_TO_KB(task->mm->total_vm); __entry->anon_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_ANONPAGES)); __entry->file_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_FILEPAGES)); __entry->shmem_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_SHMEMPAGES)); __entry->uid = uid; __entry->pgtables = mm_pgtables_bytes(task->mm) >> 10; __entry->oom_score_adj = task->signal->oom_score_adj; ), TP_printk("pid=%d comm=%s total-vm=%lukB anon-rss=%lukB file-rss:%lukB shmem-rss:%lukB uid=%u pgtables=%lukB oom_score_adj=%hd", __entry->pid, __get_str(comm), __entry->total_vm, __entry->anon_rss, __entry->file_rss, __entry->shmem_rss, __entry->uid, __entry->pgtables, __entry->oom_score_adj ) ); TRACE_EVENT(wake_reaper, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); TRACE_EVENT(start_task_reaping, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); TRACE_EVENT(finish_task_reaping, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); TRACE_EVENT(skip_task_reaping, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); #ifdef CONFIG_COMPACTION TRACE_EVENT(compact_retry, TP_PROTO(int order, enum compact_priority priority, enum compact_result result, int retries, int max_retries, bool ret), TP_ARGS(order, priority, result, retries, max_retries, ret), TP_STRUCT__entry( __field( int, order) __field( int, priority) __field( int, result) __field( int, retries) __field( int, max_retries) __field( bool, ret) ), TP_fast_assign( __entry->order = order; __entry->priority = priority; __entry->result = compact_result_to_feedback(result); __entry->retries = retries; __entry->max_retries = max_retries; __entry->ret = ret; ), TP_printk("order=%d priority=%s compaction_result=%s retries=%d max_retries=%d should_retry=%d", __entry->order, __print_symbolic(__entry->priority, COMPACTION_PRIORITY), __print_symbolic(__entry->result, COMPACTION_FEEDBACK), __entry->retries, __entry->max_retries, __entry->ret) ); #endif /* CONFIG_COMPACTION */ #endif /* This part must be outside protection */ #include <trace/define_trace.h>
2 2 2 2 3 2 2 2 3 2 2 2 1 2 5 2 2 2 2 2 2 2 16 16 16 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 13 2 15 15 15 15 15 15 15 14 14 13 2 13 13 13 13 17 17 17 17 12 12 14 16 16 16 16 16 16 16 16 16 13 13 3 4 3 2 2 2 2 2 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) International Business Machines Corp., 2006 * Copyright (c) Nokia Corporation, 2007 * * Author: Artem Bityutskiy (Битюцкий Артём), * Frank Haverkamp */ /* * This file includes UBI initialization and building of UBI devices. * * When UBI is initialized, it attaches all the MTD devices specified as the * module load parameters or the kernel boot parameters. If MTD devices were * specified, UBI does not attach any MTD device, but it is possible to do * later using the "UBI control device". */ #include <linux/err.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/stringify.h> #include <linux/namei.h> #include <linux/stat.h> #include <linux/miscdevice.h> #include <linux/mtd/partitions.h> #include <linux/log2.h> #include <linux/kthread.h> #include <linux/kernel.h> #include <linux/of.h> #include <linux/slab.h> #include <linux/major.h> #include "ubi.h" /* Maximum length of the 'mtd=' parameter */ #define MTD_PARAM_LEN_MAX 64 /* Maximum number of comma-separated items in the 'mtd=' parameter */ #define MTD_PARAM_MAX_COUNT 6 /* Maximum value for the number of bad PEBs per 1024 PEBs */ #define MAX_MTD_UBI_BEB_LIMIT 768 #ifdef CONFIG_MTD_UBI_MODULE #define ubi_is_module() 1 #else #define ubi_is_module() 0 #endif /** * struct mtd_dev_param - MTD device parameter description data structure. * @name: MTD character device node path, MTD device name, or MTD device number * string * @ubi_num: UBI number * @vid_hdr_offs: VID header offset * @max_beb_per1024: maximum expected number of bad PEBs per 1024 PEBs * @enable_fm: enable fastmap when value is non-zero * @need_resv_pool: reserve pool->max_size pebs when value is none-zero */ struct mtd_dev_param { char name[MTD_PARAM_LEN_MAX]; int ubi_num; int vid_hdr_offs; int max_beb_per1024; int enable_fm; int need_resv_pool; }; /* Numbers of elements set in the @mtd_dev_param array */ static int mtd_devs; /* MTD devices specification parameters */ static struct mtd_dev_param mtd_dev_param[UBI_MAX_DEVICES]; #ifdef CONFIG_MTD_UBI_FASTMAP /* UBI module parameter to enable fastmap automatically on non-fastmap images */ static bool fm_autoconvert; static bool fm_debug; #endif /* Slab cache for wear-leveling entries */ struct kmem_cache *ubi_wl_entry_slab; /* UBI control character device */ static struct miscdevice ubi_ctrl_cdev = { .minor = MISC_DYNAMIC_MINOR, .name = "ubi_ctrl", .fops = &ubi_ctrl_cdev_operations, }; /* All UBI devices in system */ static struct ubi_device *ubi_devices[UBI_MAX_DEVICES]; /* Serializes UBI devices creations and removals */ DEFINE_MUTEX(ubi_devices_mutex); /* Protects @ubi_devices, @ubi->ref_count and @ubi->is_dead */ static DEFINE_SPINLOCK(ubi_devices_lock); /* "Show" method for files in '/<sysfs>/class/ubi/' */ /* UBI version attribute ('/<sysfs>/class/ubi/version') */ static ssize_t version_show(const struct class *class, const struct class_attribute *attr, char *buf) { return sprintf(buf, "%d\n", UBI_VERSION); } static CLASS_ATTR_RO(version); static struct attribute *ubi_class_attrs[] = { &class_attr_version.attr, NULL, }; ATTRIBUTE_GROUPS(ubi_class); /* Root UBI "class" object (corresponds to '/<sysfs>/class/ubi/') */ const struct class ubi_class = { .name = UBI_NAME_STR, .class_groups = ubi_class_groups, }; static ssize_t dev_attribute_show(struct device *dev, struct device_attribute *attr, char *buf); /* UBI device attributes (correspond to files in '/<sysfs>/class/ubi/ubiX') */ static struct device_attribute dev_eraseblock_size = __ATTR(eraseblock_size, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_avail_eraseblocks = __ATTR(avail_eraseblocks, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_total_eraseblocks = __ATTR(total_eraseblocks, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_volumes_count = __ATTR(volumes_count, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_max_ec = __ATTR(max_ec, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_reserved_for_bad = __ATTR(reserved_for_bad, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_bad_peb_count = __ATTR(bad_peb_count, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_max_vol_count = __ATTR(max_vol_count, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_min_io_size = __ATTR(min_io_size, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_bgt_enabled = __ATTR(bgt_enabled, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_mtd_num = __ATTR(mtd_num, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_ro_mode = __ATTR(ro_mode, S_IRUGO, dev_attribute_show, NULL); /** * ubi_volume_notify - send a volume change notification. * @ubi: UBI device description object * @vol: volume description object of the changed volume * @ntype: notification type to send (%UBI_VOLUME_ADDED, etc) * * This is a helper function which notifies all subscribers about a volume * change event (creation, removal, re-sizing, re-naming, updating). Returns * zero in case of success and a negative error code in case of failure. */ int ubi_volume_notify(struct ubi_device *ubi, struct ubi_volume *vol, int ntype) { int ret; struct ubi_notification nt; ubi_do_get_device_info(ubi, &nt.di); ubi_do_get_volume_info(ubi, vol, &nt.vi); switch (ntype) { case UBI_VOLUME_ADDED: case UBI_VOLUME_REMOVED: case UBI_VOLUME_RESIZED: case UBI_VOLUME_RENAMED: ret = ubi_update_fastmap(ubi); if (ret) ubi_msg(ubi, "Unable to write a new fastmap: %i", ret); } return blocking_notifier_call_chain(&ubi_notifiers, ntype, &nt); } /** * ubi_notify_all - send a notification to all volumes. * @ubi: UBI device description object * @ntype: notification type to send (%UBI_VOLUME_ADDED, etc) * @nb: the notifier to call * * This function walks all volumes of UBI device @ubi and sends the @ntype * notification for each volume. If @nb is %NULL, then all registered notifiers * are called, otherwise only the @nb notifier is called. Returns the number of * sent notifications. */ int ubi_notify_all(struct ubi_device *ubi, int ntype, struct notifier_block *nb) { struct ubi_notification nt; int i, count = 0; ubi_do_get_device_info(ubi, &nt.di); mutex_lock(&ubi->device_mutex); for (i = 0; i < ubi->vtbl_slots; i++) { /* * Since the @ubi->device is locked, and we are not going to * change @ubi->volumes, we do not have to lock * @ubi->volumes_lock. */ if (!ubi->volumes[i]) continue; ubi_do_get_volume_info(ubi, ubi->volumes[i], &nt.vi); if (nb) nb->notifier_call(nb, ntype, &nt); else blocking_notifier_call_chain(&ubi_notifiers, ntype, &nt); count += 1; } mutex_unlock(&ubi->device_mutex); return count; } /** * ubi_enumerate_volumes - send "add" notification for all existing volumes. * @nb: the notifier to call * * This function walks all UBI devices and volumes and sends the * %UBI_VOLUME_ADDED notification for each volume. If @nb is %NULL, then all * registered notifiers are called, otherwise only the @nb notifier is called. * Returns the number of sent notifications. */ int ubi_enumerate_volumes(struct notifier_block *nb) { int i, count = 0; /* * Since the @ubi_devices_mutex is locked, and we are not going to * change @ubi_devices, we do not have to lock @ubi_devices_lock. */ for (i = 0; i < UBI_MAX_DEVICES; i++) { struct ubi_device *ubi = ubi_devices[i]; if (!ubi) continue; count += ubi_notify_all(ubi, UBI_VOLUME_ADDED, nb); } return count; } /** * ubi_get_device - get UBI device. * @ubi_num: UBI device number * * This function returns UBI device description object for UBI device number * @ubi_num, or %NULL if the device does not exist. This function increases the * device reference count to prevent removal of the device. In other words, the * device cannot be removed if its reference count is not zero. */ struct ubi_device *ubi_get_device(int ubi_num) { struct ubi_device *ubi; spin_lock(&ubi_devices_lock); ubi = ubi_devices[ubi_num]; if (ubi && ubi->is_dead) ubi = NULL; if (ubi) { ubi_assert(ubi->ref_count >= 0); ubi->ref_count += 1; get_device(&ubi->dev); } spin_unlock(&ubi_devices_lock); return ubi; } /** * ubi_put_device - drop an UBI device reference. * @ubi: UBI device description object */ void ubi_put_device(struct ubi_device *ubi) { spin_lock(&ubi_devices_lock); ubi->ref_count -= 1; put_device(&ubi->dev); spin_unlock(&ubi_devices_lock); } /** * ubi_get_by_major - get UBI device by character device major number. * @major: major number * * This function is similar to 'ubi_get_device()', but it searches the device * by its major number. */ struct ubi_device *ubi_get_by_major(int major) { int i; struct ubi_device *ubi; spin_lock(&ubi_devices_lock); for (i = 0; i < UBI_MAX_DEVICES; i++) { ubi = ubi_devices[i]; if (ubi && !ubi->is_dead && MAJOR(ubi->cdev.dev) == major) { ubi_assert(ubi->ref_count >= 0); ubi->ref_count += 1; get_device(&ubi->dev); spin_unlock(&ubi_devices_lock); return ubi; } } spin_unlock(&ubi_devices_lock); return NULL; } /** * ubi_major2num - get UBI device number by character device major number. * @major: major number * * This function searches UBI device number object by its major number. If UBI * device was not found, this function returns -ENODEV, otherwise the UBI device * number is returned. */ int ubi_major2num(int major) { int i, ubi_num = -ENODEV; spin_lock(&ubi_devices_lock); for (i = 0; i < UBI_MAX_DEVICES; i++) { struct ubi_device *ubi = ubi_devices[i]; if (ubi && !ubi->is_dead && MAJOR(ubi->cdev.dev) == major) { ubi_num = ubi->ubi_num; break; } } spin_unlock(&ubi_devices_lock); return ubi_num; } /* "Show" method for files in '/<sysfs>/class/ubi/ubiX/' */ static ssize_t dev_attribute_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t ret; struct ubi_device *ubi; /* * The below code looks weird, but it actually makes sense. We get the * UBI device reference from the contained 'struct ubi_device'. But it * is unclear if the device was removed or not yet. Indeed, if the * device was removed before we increased its reference count, * 'ubi_get_device()' will return -ENODEV and we fail. * * Remember, 'struct ubi_device' is freed in the release function, so * we still can use 'ubi->ubi_num'. */ ubi = container_of(dev, struct ubi_device, dev); if (attr == &dev_eraseblock_size) ret = sprintf(buf, "%d\n", ubi->leb_size); else if (attr == &dev_avail_eraseblocks) ret = sprintf(buf, "%d\n", ubi->avail_pebs); else if (attr == &dev_total_eraseblocks) ret = sprintf(buf, "%d\n", ubi->good_peb_count); else if (attr == &dev_volumes_count) ret = sprintf(buf, "%d\n", ubi->vol_count - UBI_INT_VOL_COUNT); else if (attr == &dev_max_ec) ret = sprintf(buf, "%d\n", ubi->max_ec); else if (attr == &dev_reserved_for_bad) ret = sprintf(buf, "%d\n", ubi->beb_rsvd_pebs); else if (attr == &dev_bad_peb_count) ret = sprintf(buf, "%d\n", ubi->bad_peb_count); else if (attr == &dev_max_vol_count) ret = sprintf(buf, "%d\n", ubi->vtbl_slots); else if (attr == &dev_min_io_size) ret = sprintf(buf, "%d\n", ubi->min_io_size); else if (attr == &dev_bgt_enabled) ret = sprintf(buf, "%d\n", ubi->thread_enabled); else if (attr == &dev_mtd_num) ret = sprintf(buf, "%d\n", ubi->mtd->index); else if (attr == &dev_ro_mode) ret = sprintf(buf, "%d\n", ubi->ro_mode); else ret = -EINVAL; return ret; } static struct attribute *ubi_dev_attrs[] = { &dev_eraseblock_size.attr, &dev_avail_eraseblocks.attr, &dev_total_eraseblocks.attr, &dev_volumes_count.attr, &dev_max_ec.attr, &dev_reserved_for_bad.attr, &dev_bad_peb_count.attr, &dev_max_vol_count.attr, &dev_min_io_size.attr, &dev_bgt_enabled.attr, &dev_mtd_num.attr, &dev_ro_mode.attr, NULL }; ATTRIBUTE_GROUPS(ubi_dev); static void dev_release(struct device *dev) { struct ubi_device *ubi = container_of(dev, struct ubi_device, dev); kfree(ubi); } /** * kill_volumes - destroy all user volumes. * @ubi: UBI device description object */ static void kill_volumes(struct ubi_device *ubi) { int i; for (i = 0; i < ubi->vtbl_slots; i++) if (ubi->volumes[i]) ubi_free_volume(ubi, ubi->volumes[i]); } /** * uif_init - initialize user interfaces for an UBI device. * @ubi: UBI device description object * * This function initializes various user interfaces for an UBI device. If the * initialization fails at an early stage, this function frees all the * resources it allocated, returns an error. * * This function returns zero in case of success and a negative error code in * case of failure. */ static int uif_init(struct ubi_device *ubi) { int i, err; dev_t dev; sprintf(ubi->ubi_name, UBI_NAME_STR "%d", ubi->ubi_num); /* * Major numbers for the UBI character devices are allocated * dynamically. Major numbers of volume character devices are * equivalent to ones of the corresponding UBI character device. Minor * numbers of UBI character devices are 0, while minor numbers of * volume character devices start from 1. Thus, we allocate one major * number and ubi->vtbl_slots + 1 minor numbers. */ err = alloc_chrdev_region(&dev, 0, ubi->vtbl_slots + 1, ubi->ubi_name); if (err) { ubi_err(ubi, "cannot register UBI character devices"); return err; } ubi->dev.devt = dev; ubi_assert(MINOR(dev) == 0); cdev_init(&ubi->cdev, &ubi_cdev_operations); dbg_gen("%s major is %u", ubi->ubi_name, MAJOR(dev)); ubi->cdev.owner = THIS_MODULE; dev_set_name(&ubi->dev, UBI_NAME_STR "%d", ubi->ubi_num); err = cdev_device_add(&ubi->cdev, &ubi->dev); if (err) goto out_unreg; for (i = 0; i < ubi->vtbl_slots; i++) if (ubi->volumes[i]) { err = ubi_add_volume(ubi, ubi->volumes[i]); if (err) { ubi_err(ubi, "cannot add volume %d", i); ubi->volumes[i] = NULL; goto out_volumes; } } return 0; out_volumes: kill_volumes(ubi); cdev_device_del(&ubi->cdev, &ubi->dev); out_unreg: unregister_chrdev_region(ubi->cdev.dev, ubi->vtbl_slots + 1); ubi_err(ubi, "cannot initialize UBI %s, error %d", ubi->ubi_name, err); return err; } /** * uif_close - close user interfaces for an UBI device. * @ubi: UBI device description object * * Note, since this function un-registers UBI volume device objects (@vol->dev), * the memory allocated voe the volumes is freed as well (in the release * function). */ static void uif_close(struct ubi_device *ubi) { kill_volumes(ubi); cdev_device_del(&ubi->cdev, &ubi->dev); unregister_chrdev_region(ubi->cdev.dev, ubi->vtbl_slots + 1); } /** * ubi_free_volumes_from - free volumes from specific index. * @ubi: UBI device description object * @from: the start index used for volume free. */ static void ubi_free_volumes_from(struct ubi_device *ubi, int from) { int i; for (i = from; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) { if (!ubi->volumes[i] || ubi->volumes[i]->is_dead) continue; ubi_eba_replace_table(ubi->volumes[i], NULL); ubi_fastmap_destroy_checkmap(ubi->volumes[i]); kfree(ubi->volumes[i]); ubi->volumes[i] = NULL; } } /** * ubi_free_all_volumes - free all volumes. * @ubi: UBI device description object */ void ubi_free_all_volumes(struct ubi_device *ubi) { ubi_free_volumes_from(ubi, 0); } /** * ubi_free_internal_volumes - free internal volumes. * @ubi: UBI device description object */ void ubi_free_internal_volumes(struct ubi_device *ubi) { ubi_free_volumes_from(ubi, ubi->vtbl_slots); } static int get_bad_peb_limit(const struct ubi_device *ubi, int max_beb_per1024) { int limit, device_pebs; uint64_t device_size; if (!max_beb_per1024) { /* * Since max_beb_per1024 has not been set by the user in either * the cmdline or Kconfig, use mtd_max_bad_blocks to set the * limit if it is supported by the device. */ limit = mtd_max_bad_blocks(ubi->mtd, 0, ubi->mtd->size); if (limit < 0) return 0; return limit; } /* * Here we are using size of the entire flash chip and * not just the MTD partition size because the maximum * number of bad eraseblocks is a percentage of the * whole device and bad eraseblocks are not fairly * distributed over the flash chip. So the worst case * is that all the bad eraseblocks of the chip are in * the MTD partition we are attaching (ubi->mtd). */ device_size = mtd_get_device_size(ubi->mtd); device_pebs = mtd_div_by_eb(device_size, ubi->mtd); limit = mult_frac(device_pebs, max_beb_per1024, 1024); /* Round it up */ if (mult_frac(limit, 1024, max_beb_per1024) < device_pebs) limit += 1; return limit; } /** * io_init - initialize I/O sub-system for a given UBI device. * @ubi: UBI device description object * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs * * If @ubi->vid_hdr_offset or @ubi->leb_start is zero, default offsets are * assumed: * o EC header is always at offset zero - this cannot be changed; * o VID header starts just after the EC header at the closest address * aligned to @io->hdrs_min_io_size; * o data starts just after the VID header at the closest address aligned to * @io->min_io_size * * This function returns zero in case of success and a negative error code in * case of failure. */ static int io_init(struct ubi_device *ubi, int max_beb_per1024) { dbg_gen("sizeof(struct ubi_ainf_peb) %zu", sizeof(struct ubi_ainf_peb)); dbg_gen("sizeof(struct ubi_wl_entry) %zu", sizeof(struct ubi_wl_entry)); if (ubi->mtd->numeraseregions != 0) { /* * Some flashes have several erase regions. Different regions * may have different eraseblock size and other * characteristics. It looks like mostly multi-region flashes * have one "main" region and one or more small regions to * store boot loader code or boot parameters or whatever. I * guess we should just pick the largest region. But this is * not implemented. */ ubi_err(ubi, "multiple regions, not implemented"); return -EINVAL; } if (ubi->vid_hdr_offset < 0) return -EINVAL; /* * Note, in this implementation we support MTD devices with 0x7FFFFFFF * physical eraseblocks maximum. */ ubi->peb_size = ubi->mtd->erasesize; ubi->peb_count = mtd_div_by_eb(ubi->mtd->size, ubi->mtd); ubi->flash_size = ubi->mtd->size; if (mtd_can_have_bb(ubi->mtd)) { ubi->bad_allowed = 1; ubi->bad_peb_limit = get_bad_peb_limit(ubi, max_beb_per1024); } if (ubi->mtd->type == MTD_NORFLASH) ubi->nor_flash = 1; ubi->min_io_size = ubi->mtd->writesize; ubi->hdrs_min_io_size = ubi->mtd->writesize >> ubi->mtd->subpage_sft; /* * Make sure minimal I/O unit is power of 2. Note, there is no * fundamental reason for this assumption. It is just an optimization * which allows us to avoid costly division operations. */ if (!is_power_of_2(ubi->min_io_size)) { ubi_err(ubi, "min. I/O unit (%d) is not power of 2", ubi->min_io_size); return -EINVAL; } ubi_assert(ubi->hdrs_min_io_size > 0); ubi_assert(ubi->hdrs_min_io_size <= ubi->min_io_size); ubi_assert(ubi->min_io_size % ubi->hdrs_min_io_size == 0); ubi->max_write_size = ubi->mtd->writebufsize; /* * Maximum write size has to be greater or equivalent to min. I/O * size, and be multiple of min. I/O size. */ if (ubi->max_write_size < ubi->min_io_size || ubi->max_write_size % ubi->min_io_size || !is_power_of_2(ubi->max_write_size)) { ubi_err(ubi, "bad write buffer size %d for %d min. I/O unit", ubi->max_write_size, ubi->min_io_size); return -EINVAL; } /* Calculate default aligned sizes of EC and VID headers */ ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size); ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size); dbg_gen("min_io_size %d", ubi->min_io_size); dbg_gen("max_write_size %d", ubi->max_write_size); dbg_gen("hdrs_min_io_size %d", ubi->hdrs_min_io_size); dbg_gen("ec_hdr_alsize %d", ubi->ec_hdr_alsize); dbg_gen("vid_hdr_alsize %d", ubi->vid_hdr_alsize); if (ubi->vid_hdr_offset == 0) /* Default offset */ ubi->vid_hdr_offset = ubi->vid_hdr_aloffset = ubi->ec_hdr_alsize; else { ubi->vid_hdr_aloffset = ubi->vid_hdr_offset & ~(ubi->hdrs_min_io_size - 1); ubi->vid_hdr_shift = ubi->vid_hdr_offset - ubi->vid_hdr_aloffset; } /* * Memory allocation for VID header is ubi->vid_hdr_alsize * which is described in comments in io.c. * Make sure VID header shift + UBI_VID_HDR_SIZE not exceeds * ubi->vid_hdr_alsize, so that all vid header operations * won't access memory out of bounds. */ if ((ubi->vid_hdr_shift + UBI_VID_HDR_SIZE) > ubi->vid_hdr_alsize) { ubi_err(ubi, "Invalid VID header offset %d, VID header shift(%d)" " + VID header size(%zu) > VID header aligned size(%d).", ubi->vid_hdr_offset, ubi->vid_hdr_shift, UBI_VID_HDR_SIZE, ubi->vid_hdr_alsize); return -EINVAL; } /* Similar for the data offset */ ubi->leb_start = ubi->vid_hdr_offset + UBI_VID_HDR_SIZE; ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size); dbg_gen("vid_hdr_offset %d", ubi->vid_hdr_offset); dbg_gen("vid_hdr_aloffset %d", ubi->vid_hdr_aloffset); dbg_gen("vid_hdr_shift %d", ubi->vid_hdr_shift); dbg_gen("leb_start %d", ubi->leb_start); /* The shift must be aligned to 32-bit boundary */ if (ubi->vid_hdr_shift % 4) { ubi_err(ubi, "unaligned VID header shift %d", ubi->vid_hdr_shift); return -EINVAL; } /* Check sanity */ if (ubi->vid_hdr_offset < UBI_EC_HDR_SIZE || ubi->leb_start < ubi->vid_hdr_offset + UBI_VID_HDR_SIZE || ubi->leb_start > ubi->peb_size - UBI_VID_HDR_SIZE || ubi->leb_start & (ubi->min_io_size - 1)) { ubi_err(ubi, "bad VID header (%d) or data offsets (%d)", ubi->vid_hdr_offset, ubi->leb_start); return -EINVAL; } /* * Set maximum amount of physical erroneous eraseblocks to be 10%. * Erroneous PEB are those which have read errors. */ ubi->max_erroneous = ubi->peb_count / 10; if (ubi->max_erroneous < 16) ubi->max_erroneous = 16; dbg_gen("max_erroneous %d", ubi->max_erroneous); /* * It may happen that EC and VID headers are situated in one minimal * I/O unit. In this case we can only accept this UBI image in * read-only mode. */ if (ubi->vid_hdr_offset + UBI_VID_HDR_SIZE <= ubi->hdrs_min_io_size) { ubi_warn(ubi, "EC and VID headers are in the same minimal I/O unit, switch to read-only mode"); ubi->ro_mode = 1; } ubi->leb_size = ubi->peb_size - ubi->leb_start; if (!(ubi->mtd->flags & MTD_WRITEABLE)) { ubi_msg(ubi, "MTD device %d is write-protected, attach in read-only mode", ubi->mtd->index); ubi->ro_mode = 1; } /* * Note, ideally, we have to initialize @ubi->bad_peb_count here. But * unfortunately, MTD does not provide this information. We should loop * over all physical eraseblocks and invoke mtd->block_is_bad() for * each physical eraseblock. So, we leave @ubi->bad_peb_count * uninitialized so far. */ return 0; } /** * autoresize - re-size the volume which has the "auto-resize" flag set. * @ubi: UBI device description object * @vol_id: ID of the volume to re-size * * This function re-sizes the volume marked by the %UBI_VTBL_AUTORESIZE_FLG in * the volume table to the largest possible size. See comments in ubi-header.h * for more description of the flag. Returns zero in case of success and a * negative error code in case of failure. */ static int autoresize(struct ubi_device *ubi, int vol_id) { struct ubi_volume_desc desc; struct ubi_volume *vol = ubi->volumes[vol_id]; int err, old_reserved_pebs = vol->reserved_pebs; if (ubi->ro_mode) { ubi_warn(ubi, "skip auto-resize because of R/O mode"); return 0; } /* * Clear the auto-resize flag in the volume in-memory copy of the * volume table, and 'ubi_resize_volume()' will propagate this change * to the flash. */ ubi->vtbl[vol_id].flags &= ~UBI_VTBL_AUTORESIZE_FLG; if (ubi->avail_pebs == 0) { struct ubi_vtbl_record vtbl_rec; /* * No available PEBs to re-size the volume, clear the flag on * flash and exit. */ vtbl_rec = ubi->vtbl[vol_id]; err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); if (err) ubi_err(ubi, "cannot clean auto-resize flag for volume %d", vol_id); } else { desc.vol = vol; err = ubi_resize_volume(&desc, old_reserved_pebs + ubi->avail_pebs); if (err) ubi_err(ubi, "cannot auto-resize volume %d", vol_id); } if (err) return err; ubi_msg(ubi, "volume %d (\"%s\") re-sized from %d to %d LEBs", vol_id, vol->name, old_reserved_pebs, vol->reserved_pebs); return 0; } /** * ubi_attach_mtd_dev - attach an MTD device. * @mtd: MTD device description object * @ubi_num: number to assign to the new UBI device * @vid_hdr_offset: VID header offset * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs * @disable_fm: whether disable fastmap * @need_resv_pool: whether reserve pebs to fill fm_pool * * This function attaches MTD device @mtd_dev to UBI and assign @ubi_num number * to the newly created UBI device, unless @ubi_num is %UBI_DEV_NUM_AUTO, in * which case this function finds a vacant device number and assigns it * automatically. Returns the new UBI device number in case of success and a * negative error code in case of failure. * * If @disable_fm is true, ubi doesn't create new fastmap even the module param * 'fm_autoconvert' is set, and existed old fastmap will be destroyed after * doing full scanning. * * Note, the invocations of this function has to be serialized by the * @ubi_devices_mutex. */ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, int vid_hdr_offset, int max_beb_per1024, bool disable_fm, bool need_resv_pool) { struct ubi_device *ubi; int i, err; if (max_beb_per1024 < 0 || max_beb_per1024 > MAX_MTD_UBI_BEB_LIMIT) return -EINVAL; if (!max_beb_per1024) max_beb_per1024 = CONFIG_MTD_UBI_BEB_LIMIT; /* * Check if we already have the same MTD device attached. * * Note, this function assumes that UBI devices creations and deletions * are serialized, so it does not take the &ubi_devices_lock. */ for (i = 0; i < UBI_MAX_DEVICES; i++) { ubi = ubi_devices[i]; if (ubi && mtd->index == ubi->mtd->index) { pr_err("ubi: mtd%d is already attached to ubi%d\n", mtd->index, i); return -EEXIST; } } /* * Make sure this MTD device is not emulated on top of an UBI volume * already. Well, generally this recursion works fine, but there are * different problems like the UBI module takes a reference to itself * by attaching (and thus, opening) the emulated MTD device. This * results in inability to unload the module. And in general it makes * no sense to attach emulated MTD devices, so we prohibit this. */ if (mtd->type == MTD_UBIVOLUME) { pr_err("ubi: refuse attaching mtd%d - it is already emulated on top of UBI\n", mtd->index); return -EINVAL; } /* * Both UBI and UBIFS have been designed for SLC NAND and NOR flashes. * MLC NAND is different and needs special care, otherwise UBI or UBIFS * will die soon and you will lose all your data. * Relax this rule if the partition we're attaching to operates in SLC * mode. */ if (mtd->type == MTD_MLCNANDFLASH && !(mtd->flags & MTD_SLC_ON_MLC_EMULATION)) { pr_err("ubi: refuse attaching mtd%d - MLC NAND is not supported\n", mtd->index); return -EINVAL; } /* UBI cannot work on flashes with zero erasesize. */ if (!mtd->erasesize) { pr_err("ubi: refuse attaching mtd%d - zero erasesize flash is not supported\n", mtd->index); return -EINVAL; } if (ubi_num == UBI_DEV_NUM_AUTO) { /* Search for an empty slot in the @ubi_devices array */ for (ubi_num = 0; ubi_num < UBI_MAX_DEVICES; ubi_num++) if (!ubi_devices[ubi_num]) break; if (ubi_num == UBI_MAX_DEVICES) { pr_err("ubi: only %d UBI devices may be created\n", UBI_MAX_DEVICES); return -ENFILE; } } else { if (ubi_num >= UBI_MAX_DEVICES) return -EINVAL; /* Make sure ubi_num is not busy */ if (ubi_devices[ubi_num]) { pr_err("ubi: ubi%i already exists\n", ubi_num); return -EEXIST; } } ubi = kzalloc(sizeof(struct ubi_device), GFP_KERNEL); if (!ubi) return -ENOMEM; device_initialize(&ubi->dev); ubi->dev.release = dev_release; ubi->dev.class = &ubi_class; ubi->dev.groups = ubi_dev_groups; ubi->dev.parent = &mtd->dev; ubi->mtd = mtd; ubi->ubi_num = ubi_num; ubi->vid_hdr_offset = vid_hdr_offset; ubi->autoresize_vol_id = -1; #ifdef CONFIG_MTD_UBI_FASTMAP ubi->fm_pool.used = ubi->fm_pool.size = 0; ubi->fm_wl_pool.used = ubi->fm_wl_pool.size = 0; /* * fm_pool.max_size is 5% of the total number of PEBs but it's also * between UBI_FM_MAX_POOL_SIZE and UBI_FM_MIN_POOL_SIZE. */ ubi->fm_pool.max_size = min(((int)mtd_div_by_eb(ubi->mtd->size, ubi->mtd) / 100) * 5, UBI_FM_MAX_POOL_SIZE); ubi->fm_pool.max_size = max(ubi->fm_pool.max_size, UBI_FM_MIN_POOL_SIZE); ubi->fm_wl_pool.max_size = ubi->fm_pool.max_size / 2; ubi->fm_pool_rsv_cnt = need_resv_pool ? ubi->fm_pool.max_size : 0; ubi->fm_disabled = (!fm_autoconvert || disable_fm) ? 1 : 0; if (fm_debug) ubi_enable_dbg_chk_fastmap(ubi); if (!ubi->fm_disabled && (int)mtd_div_by_eb(ubi->mtd->size, ubi->mtd) <= UBI_FM_MAX_START) { ubi_err(ubi, "More than %i PEBs are needed for fastmap, sorry.", UBI_FM_MAX_START); ubi->fm_disabled = 1; } ubi_msg(ubi, "default fastmap pool size: %d", ubi->fm_pool.max_size); ubi_msg(ubi, "default fastmap WL pool size: %d", ubi->fm_wl_pool.max_size); #else ubi->fm_disabled = 1; #endif mutex_init(&ubi->buf_mutex); mutex_init(&ubi->ckvol_mutex); mutex_init(&ubi->device_mutex); spin_lock_init(&ubi->volumes_lock); init_rwsem(&ubi->fm_protect); init_rwsem(&ubi->fm_eba_sem); ubi_msg(ubi, "attaching mtd%d", mtd->index); err = io_init(ubi, max_beb_per1024); if (err) goto out_free; err = -ENOMEM; ubi->peb_buf = vmalloc(ubi->peb_size); if (!ubi->peb_buf) goto out_free; #ifdef CONFIG_MTD_UBI_FASTMAP ubi->fm_size = ubi_calc_fm_size(ubi); ubi->fm_buf = vzalloc(ubi->fm_size); if (!ubi->fm_buf) goto out_free; #endif err = ubi_attach(ubi, disable_fm ? 1 : 0); if (err) { ubi_err(ubi, "failed to attach mtd%d, error %d", mtd->index, err); goto out_free; } if (ubi->autoresize_vol_id != -1) { err = autoresize(ubi, ubi->autoresize_vol_id); if (err) goto out_detach; } err = uif_init(ubi); if (err) goto out_detach; err = ubi_debugfs_init_dev(ubi); if (err) goto out_uif; ubi->bgt_thread = kthread_create(ubi_thread, ubi, "%s", ubi->bgt_name); if (IS_ERR(ubi->bgt_thread)) { err = PTR_ERR(ubi->bgt_thread); ubi_err(ubi, "cannot spawn \"%s\", error %d", ubi->bgt_name, err); goto out_debugfs; } ubi_msg(ubi, "attached mtd%d (name \"%s\", size %llu MiB)", mtd->index, mtd->name, ubi->flash_size >> 20); ubi_msg(ubi, "PEB size: %d bytes (%d KiB), LEB size: %d bytes", ubi->peb_size, ubi->peb_size >> 10, ubi->leb_size); ubi_msg(ubi, "min./max. I/O unit sizes: %d/%d, sub-page size %d", ubi->min_io_size, ubi->max_write_size, ubi->hdrs_min_io_size); ubi_msg(ubi, "VID header offset: %d (aligned %d), data offset: %d", ubi->vid_hdr_offset, ubi->vid_hdr_aloffset, ubi->leb_start); ubi_msg(ubi, "good PEBs: %d, bad PEBs: %d, corrupted PEBs: %d", ubi->good_peb_count, ubi->bad_peb_count, ubi->corr_peb_count); ubi_msg(ubi, "user volume: %d, internal volumes: %d, max. volumes count: %d", ubi->vol_count - UBI_INT_VOL_COUNT, UBI_INT_VOL_COUNT, ubi->vtbl_slots); ubi_msg(ubi, "max/mean erase counter: %d/%d, WL threshold: %d, image sequence number: %u", ubi->max_ec, ubi->mean_ec, CONFIG_MTD_UBI_WL_THRESHOLD, ubi->image_seq); ubi_msg(ubi, "available PEBs: %d, total reserved PEBs: %d, PEBs reserved for bad PEB handling: %d", ubi->avail_pebs, ubi->rsvd_pebs, ubi->beb_rsvd_pebs); /* * The below lock makes sure we do not race with 'ubi_thread()' which * checks @ubi->thread_enabled. Otherwise we may fail to wake it up. */ spin_lock(&ubi->wl_lock); ubi->thread_enabled = 1; wake_up_process(ubi->bgt_thread); spin_unlock(&ubi->wl_lock); ubi_devices[ubi_num] = ubi; ubi_notify_all(ubi, UBI_VOLUME_ADDED, NULL); return ubi_num; out_debugfs: ubi_debugfs_exit_dev(ubi); out_uif: uif_close(ubi); out_detach: ubi_wl_close(ubi); ubi_free_all_volumes(ubi); vfree(ubi->vtbl); out_free: vfree(ubi->peb_buf); vfree(ubi->fm_buf); put_device(&ubi->dev); return err; } /** * ubi_detach_mtd_dev - detach an MTD device. * @ubi_num: UBI device number to detach from * @anyway: detach MTD even if device reference count is not zero * * This function destroys an UBI device number @ubi_num and detaches the * underlying MTD device. Returns zero in case of success and %-EBUSY if the * UBI device is busy and cannot be destroyed, and %-EINVAL if it does not * exist. * * Note, the invocations of this function has to be serialized by the * @ubi_devices_mutex. */ int ubi_detach_mtd_dev(int ubi_num, int anyway) { struct ubi_device *ubi; if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES) return -EINVAL; ubi = ubi_get_device(ubi_num); if (!ubi) return -EINVAL; spin_lock(&ubi_devices_lock); ubi->ref_count -= 1; if (ubi->ref_count) { if (!anyway) { spin_unlock(&ubi_devices_lock); return -EBUSY; } /* This may only happen if there is a bug */ ubi_err(ubi, "%s reference count %d, destroy anyway", ubi->ubi_name, ubi->ref_count); } ubi->is_dead = true; spin_unlock(&ubi_devices_lock); ubi_notify_all(ubi, UBI_VOLUME_SHUTDOWN, NULL); spin_lock(&ubi_devices_lock); put_device(&ubi->dev); ubi_devices[ubi_num] = NULL; spin_unlock(&ubi_devices_lock); ubi_assert(ubi_num == ubi->ubi_num); ubi_notify_all(ubi, UBI_VOLUME_REMOVED, NULL); ubi_msg(ubi, "detaching mtd%d", ubi->mtd->index); #ifdef CONFIG_MTD_UBI_FASTMAP /* If we don't write a new fastmap at detach time we lose all * EC updates that have been made since the last written fastmap. * In case of fastmap debugging we omit the update to simulate an * unclean shutdown. */ if (!ubi_dbg_chk_fastmap(ubi)) ubi_update_fastmap(ubi); #endif /* * Before freeing anything, we have to stop the background thread to * prevent it from doing anything on this device while we are freeing. */ if (ubi->bgt_thread) kthread_stop(ubi->bgt_thread); #ifdef CONFIG_MTD_UBI_FASTMAP cancel_work_sync(&ubi->fm_work); #endif ubi_debugfs_exit_dev(ubi); uif_close(ubi); ubi_wl_close(ubi); ubi_free_internal_volumes(ubi); vfree(ubi->vtbl); vfree(ubi->peb_buf); vfree(ubi->fm_buf); ubi_msg(ubi, "mtd%d is detached", ubi->mtd->index); put_mtd_device(ubi->mtd); put_device(&ubi->dev); return 0; } /** * open_mtd_by_chdev - open an MTD device by its character device node path. * @mtd_dev: MTD character device node path * * This helper function opens an MTD device by its character node device path. * Returns MTD device description object in case of success and a negative * error code in case of failure. */ static struct mtd_info * __init open_mtd_by_chdev(const char *mtd_dev) { int err, minor; struct path path; struct kstat stat; /* Probably this is an MTD character device node path */ err = kern_path(mtd_dev, LOOKUP_FOLLOW, &path); if (err) return ERR_PTR(err); err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); path_put(&path); if (err) return ERR_PTR(err); /* MTD device number is defined by the major / minor numbers */ if (MAJOR(stat.rdev) != MTD_CHAR_MAJOR || !S_ISCHR(stat.mode)) return ERR_PTR(-EINVAL); minor = MINOR(stat.rdev); if (minor & 1) /* * Just do not think the "/dev/mtdrX" devices support is need, * so do not support them to avoid doing extra work. */ return ERR_PTR(-EINVAL); return get_mtd_device(NULL, minor / 2); } /** * open_mtd_device - open MTD device by name, character device path, or number. * @mtd_dev: name, character device node path, or MTD device device number * * This function tries to open and MTD device described by @mtd_dev string, * which is first treated as ASCII MTD device number, and if it is not true, it * is treated as MTD device name, and if that is also not true, it is treated * as MTD character device node path. Returns MTD device description object in * case of success and a negative error code in case of failure. */ static struct mtd_info * __init open_mtd_device(const char *mtd_dev) { struct mtd_info *mtd; int mtd_num; char *endp; mtd_num = simple_strtoul(mtd_dev, &endp, 0); if (*endp != '\0' || mtd_dev == endp) { /* * This does not look like an ASCII integer, probably this is * MTD device name. */ mtd = get_mtd_device_nm(mtd_dev); if (PTR_ERR(mtd) == -ENODEV) /* Probably this is an MTD character device node path */ mtd = open_mtd_by_chdev(mtd_dev); } else mtd = get_mtd_device(NULL, mtd_num); return mtd; } static void ubi_notify_add(struct mtd_info *mtd) { struct device_node *np = mtd_get_of_node(mtd); int err; if (!of_device_is_compatible(np, "linux,ubi")) return; /* * we are already holding &mtd_table_mutex, but still need * to bump refcount */ err = __get_mtd_device(mtd); if (err) return; /* called while holding mtd_table_mutex */ mutex_lock_nested(&ubi_devices_mutex, SINGLE_DEPTH_NESTING); err = ubi_attach_mtd_dev(mtd, UBI_DEV_NUM_AUTO, 0, 0, false, false); mutex_unlock(&ubi_devices_mutex); if (err < 0) __put_mtd_device(mtd); } static void ubi_notify_remove(struct mtd_info *mtd) { /* do nothing for now */ } static struct mtd_notifier ubi_mtd_notifier = { .add = ubi_notify_add, .remove = ubi_notify_remove, }; static int __init ubi_init_attach(void) { int err, i, k; /* Attach MTD devices */ for (i = 0; i < mtd_devs; i++) { struct mtd_dev_param *p = &mtd_dev_param[i]; struct mtd_info *mtd; cond_resched(); mtd = open_mtd_device(p->name); if (IS_ERR(mtd)) { err = PTR_ERR(mtd); pr_err("UBI error: cannot open mtd %s, error %d\n", p->name, err); /* See comment below re-ubi_is_module(). */ if (ubi_is_module()) goto out_detach; continue; } mutex_lock(&ubi_devices_mutex); err = ubi_attach_mtd_dev(mtd, p->ubi_num, p->vid_hdr_offs, p->max_beb_per1024, p->enable_fm == 0, p->need_resv_pool != 0); mutex_unlock(&ubi_devices_mutex); if (err < 0) { pr_err("UBI error: cannot attach mtd%d\n", mtd->index); put_mtd_device(mtd); /* * Originally UBI stopped initializing on any error. * However, later on it was found out that this * behavior is not very good when UBI is compiled into * the kernel and the MTD devices to attach are passed * through the command line. Indeed, UBI failure * stopped whole boot sequence. * * To fix this, we changed the behavior for the * non-module case, but preserved the old behavior for * the module case, just for compatibility. This is a * little inconsistent, though. */ if (ubi_is_module()) goto out_detach; } } return 0; out_detach: for (k = 0; k < i; k++) if (ubi_devices[k]) { mutex_lock(&ubi_devices_mutex); ubi_detach_mtd_dev(ubi_devices[k]->ubi_num, 1); mutex_unlock(&ubi_devices_mutex); } return err; } #ifndef CONFIG_MTD_UBI_MODULE late_initcall(ubi_init_attach); #endif static int __init ubi_init(void) { int err; /* Ensure that EC and VID headers have correct size */ BUILD_BUG_ON(sizeof(struct ubi_ec_hdr) != 64); BUILD_BUG_ON(sizeof(struct ubi_vid_hdr) != 64); if (mtd_devs > UBI_MAX_DEVICES) { pr_err("UBI error: too many MTD devices, maximum is %d\n", UBI_MAX_DEVICES); return -EINVAL; } /* Create base sysfs directory and sysfs files */ err = class_register(&ubi_class); if (err < 0) return err; err = misc_register(&ubi_ctrl_cdev); if (err) { pr_err("UBI error: cannot register device\n"); goto out; } ubi_wl_entry_slab = kmem_cache_create("ubi_wl_entry_slab", sizeof(struct ubi_wl_entry), 0, 0, NULL); if (!ubi_wl_entry_slab) { err = -ENOMEM; goto out_dev_unreg; } err = ubi_debugfs_init(); if (err) goto out_slab; err = ubiblock_init(); if (err) { pr_err("UBI error: block: cannot initialize, error %d\n", err); /* See comment above re-ubi_is_module(). */ if (ubi_is_module()) goto out_debugfs; } register_mtd_user(&ubi_mtd_notifier); if (ubi_is_module()) { err = ubi_init_attach(); if (err) goto out_mtd_notifier; } return 0; out_mtd_notifier: unregister_mtd_user(&ubi_mtd_notifier); ubiblock_exit(); out_debugfs: ubi_debugfs_exit(); out_slab: kmem_cache_destroy(ubi_wl_entry_slab); out_dev_unreg: misc_deregister(&ubi_ctrl_cdev); out: class_unregister(&ubi_class); pr_err("UBI error: cannot initialize UBI, error %d\n", err); return err; } device_initcall(ubi_init); static void __exit ubi_exit(void) { int i; ubiblock_exit(); unregister_mtd_user(&ubi_mtd_notifier); for (i = 0; i < UBI_MAX_DEVICES; i++) if (ubi_devices[i]) { mutex_lock(&ubi_devices_mutex); ubi_detach_mtd_dev(ubi_devices[i]->ubi_num, 1); mutex_unlock(&ubi_devices_mutex); } ubi_debugfs_exit(); kmem_cache_destroy(ubi_wl_entry_slab); misc_deregister(&ubi_ctrl_cdev); class_unregister(&ubi_class); } module_exit(ubi_exit); /** * bytes_str_to_int - convert a number of bytes string into an integer. * @str: the string to convert * * This function returns positive resulting integer in case of success and a * negative error code in case of failure. */ static int bytes_str_to_int(const char *str) { char *endp; unsigned long result; result = simple_strtoul(str, &endp, 0); if (str == endp || result >= INT_MAX) { pr_err("UBI error: incorrect bytes count: \"%s\"\n", str); return -EINVAL; } switch (*endp) { case 'G': result *= 1024; fallthrough; case 'M': result *= 1024; fallthrough; case 'K': result *= 1024; break; case '\0': break; default: pr_err("UBI error: incorrect bytes count: \"%s\"\n", str); return -EINVAL; } return result; } /** * ubi_mtd_param_parse - parse the 'mtd=' UBI parameter. * @val: the parameter value to parse * @kp: not used * * This function returns zero in case of success and a negative error code in * case of error. */ static int ubi_mtd_param_parse(const char *val, const struct kernel_param *kp) { int i, len; struct mtd_dev_param *p; char buf[MTD_PARAM_LEN_MAX]; char *pbuf = &buf[0]; char *tokens[MTD_PARAM_MAX_COUNT], *token; if (!val) return -EINVAL; if (mtd_devs == UBI_MAX_DEVICES) { pr_err("UBI error: too many parameters, max. is %d\n", UBI_MAX_DEVICES); return -EINVAL; } len = strnlen(val, MTD_PARAM_LEN_MAX); if (len == MTD_PARAM_LEN_MAX) { pr_err("UBI error: parameter \"%s\" is too long, max. is %d\n", val, MTD_PARAM_LEN_MAX); return -EINVAL; } if (len == 0) { pr_warn("UBI warning: empty 'mtd=' parameter - ignored\n"); return 0; } strcpy(buf, val); /* Get rid of the final newline */ if (buf[len - 1] == '\n') buf[len - 1] = '\0'; for (i = 0; i < MTD_PARAM_MAX_COUNT; i++) tokens[i] = strsep(&pbuf, ","); if (pbuf) { pr_err("UBI error: too many arguments at \"%s\"\n", val); return -EINVAL; } p = &mtd_dev_param[mtd_devs]; strcpy(&p->name[0], tokens[0]); token = tokens[1]; if (token) { p->vid_hdr_offs = bytes_str_to_int(token); if (p->vid_hdr_offs < 0) return p->vid_hdr_offs; } token = tokens[2]; if (token) { int err = kstrtoint(token, 10, &p->max_beb_per1024); if (err) { pr_err("UBI error: bad value for max_beb_per1024 parameter: %s\n", token); return -EINVAL; } } token = tokens[3]; if (token) { int err = kstrtoint(token, 10, &p->ubi_num); if (err || p->ubi_num < UBI_DEV_NUM_AUTO) { pr_err("UBI error: bad value for ubi_num parameter: %s\n", token); return -EINVAL; } } else p->ubi_num = UBI_DEV_NUM_AUTO; token = tokens[4]; if (token) { int err = kstrtoint(token, 10, &p->enable_fm); if (err) { pr_err("UBI error: bad value for enable_fm parameter: %s\n", token); return -EINVAL; } } else p->enable_fm = 0; token = tokens[5]; if (token) { int err = kstrtoint(token, 10, &p->need_resv_pool); if (err) { pr_err("UBI error: bad value for need_resv_pool parameter: %s\n", token); return -EINVAL; } } else p->need_resv_pool = 0; mtd_devs += 1; return 0; } module_param_call(mtd, ubi_mtd_param_parse, NULL, NULL, 0400); MODULE_PARM_DESC(mtd, "MTD devices to attach. Parameter format: mtd=<name|num|path>[,<vid_hdr_offs>[,max_beb_per1024[,ubi_num]]].\n" "Multiple \"mtd\" parameters may be specified.\n" "MTD devices may be specified by their number, name, or path to the MTD character device node.\n" "Optional \"vid_hdr_offs\" parameter specifies UBI VID header position to be used by UBI. (default value if 0)\n" "Optional \"max_beb_per1024\" parameter specifies the maximum expected bad eraseblock per 1024 eraseblocks. (default value (" __stringify(CONFIG_MTD_UBI_BEB_LIMIT) ") if 0)\n" "Optional \"ubi_num\" parameter specifies UBI device number which have to be assigned to the newly created UBI device (assigned automatically by default)\n" "Optional \"enable_fm\" parameter determines whether to enable fastmap during attach. If the value is non-zero, fastmap is enabled. Default value is 0.\n" "Optional \"need_resv_pool\" parameter determines whether to reserve pool->max_size pebs during attach. If the value is non-zero, peb reservation is enabled. Default value is 0.\n" "\n" "Example 1: mtd=/dev/mtd0 - attach MTD device /dev/mtd0.\n" "Example 2: mtd=content,1984 mtd=4 - attach MTD device with name \"content\" using VID header offset 1984, and MTD device number 4 with default VID header offset.\n" "Example 3: mtd=/dev/mtd1,0,25 - attach MTD device /dev/mtd1 using default VID header offset and reserve 25*nand_size_in_blocks/1024 erase blocks for bad block handling.\n" "Example 4: mtd=/dev/mtd1,0,0,5 - attach MTD device /dev/mtd1 to UBI 5 and using default values for the other fields.\n" "example 5: mtd=1,0,0,5 mtd=2,0,0,6,1 - attach MTD device /dev/mtd1 to UBI 5 and disable fastmap; attach MTD device /dev/mtd2 to UBI 6 and enable fastmap.(only works when fastmap is enabled and fm_autoconvert=Y).\n" "\t(e.g. if the NAND *chipset* has 4096 PEB, 100 will be reserved for this UBI device)."); #ifdef CONFIG_MTD_UBI_FASTMAP module_param(fm_autoconvert, bool, 0644); MODULE_PARM_DESC(fm_autoconvert, "Set this parameter to enable fastmap automatically on images without a fastmap."); module_param(fm_debug, bool, 0); MODULE_PARM_DESC(fm_debug, "Set this parameter to enable fastmap debugging by default. Warning, this will make fastmap slow!"); #endif MODULE_VERSION(__stringify(UBI_VERSION)); MODULE_DESCRIPTION("UBI - Unsorted Block Images"); MODULE_AUTHOR("Artem Bityutskiy"); MODULE_LICENSE("GPL");
13 15 15 15 15 15 7 7 19 6 2 1 1 1 2 1 1 1 2 1 1 1 1 1 19 4 4 4 4 4 4 4 13 13 13 13 13 4 4 9 9 9 9 9 9 9 9 9 9 13 13 13 4 2 9 9 9 9 9 12 8 1 13 13 13 13 13 13 13 12 3 9 9 12 8 8 4 4 7 7 7 9 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 // SPDX-License-Identifier: GPL-2.0-only /* * super.c * * PURPOSE * Super block routines for the OSTA-UDF(tm) filesystem. * * DESCRIPTION * OSTA-UDF(tm) = Optical Storage Technology Association * Universal Disk Format. * * This code is based on version 2.00 of the UDF specification, * and revision 3 of the ECMA 167 standard [equivalent to ISO 13346]. * http://www.osta.org/ * https://www.ecma.ch/ * https://www.iso.org/ * * COPYRIGHT * (C) 1998 Dave Boynton * (C) 1998-2004 Ben Fennema * (C) 2000 Stelias Computing Inc * * HISTORY * * 09/24/98 dgb changed to allow compiling outside of kernel, and * added some debugging. * 10/01/98 dgb updated to allow (some) possibility of compiling w/2.0.34 * 10/16/98 attempting some multi-session support * 10/17/98 added freespace count for "df" * 11/11/98 gr added novrs option * 11/26/98 dgb added fileset,anchor mount options * 12/06/98 blf really hosed things royally. vat/sparing support. sequenced * vol descs. rewrote option handling based on isofs * 12/20/98 find the free space bitmap (if it exists) */ #include "udfdecl.h" #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/stat.h> #include <linux/cdrom.h> #include <linux/nls.h> #include <linux/vfs.h> #include <linux/vmalloc.h> #include <linux/errno.h> #include <linux/seq_file.h> #include <linux/bitmap.h> #include <linux/crc-itu-t.h> #include <linux/log2.h> #include <asm/byteorder.h> #include <linux/iversion.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "udf_sb.h" #include "udf_i.h" #include <linux/init.h> #include <linux/uaccess.h> enum { VDS_POS_PRIMARY_VOL_DESC, VDS_POS_UNALLOC_SPACE_DESC, VDS_POS_LOGICAL_VOL_DESC, VDS_POS_IMP_USE_VOL_DESC, VDS_POS_LENGTH }; #define VSD_FIRST_SECTOR_OFFSET 32768 #define VSD_MAX_SECTOR_OFFSET 0x800000 /* * Maximum number of Terminating Descriptor / Logical Volume Integrity * Descriptor redirections. The chosen numbers are arbitrary - just that we * hopefully don't limit any real use of rewritten inode on write-once media * but avoid looping for too long on corrupted media. */ #define UDF_MAX_TD_NESTING 64 #define UDF_MAX_LVID_NESTING 1000 enum { UDF_MAX_LINKS = 0xffff }; /* * We limit filesize to 4TB. This is arbitrary as the on-disk format supports * more but because the file space is described by a linked list of extents, * each of which can have at most 1GB, the creation and handling of extents * gets unusably slow beyond certain point... */ #define UDF_MAX_FILESIZE (1ULL << 42) /* These are the "meat" - everything else is stuffing */ static int udf_fill_super(struct super_block *sb, struct fs_context *fc); static void udf_put_super(struct super_block *); static int udf_sync_fs(struct super_block *, int); static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad); static void udf_open_lvid(struct super_block *); static void udf_close_lvid(struct super_block *); static unsigned int udf_count_free(struct super_block *); static int udf_statfs(struct dentry *, struct kstatfs *); static int udf_show_options(struct seq_file *, struct dentry *); static int udf_init_fs_context(struct fs_context *fc); static int udf_parse_param(struct fs_context *fc, struct fs_parameter *param); static int udf_reconfigure(struct fs_context *fc); static void udf_free_fc(struct fs_context *fc); static const struct fs_parameter_spec udf_param_spec[]; struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb) { struct logicalVolIntegrityDesc *lvid; unsigned int partnum; unsigned int offset; if (!UDF_SB(sb)->s_lvid_bh) return NULL; lvid = (struct logicalVolIntegrityDesc *)UDF_SB(sb)->s_lvid_bh->b_data; partnum = le32_to_cpu(lvid->numOfPartitions); /* The offset is to skip freeSpaceTable and sizeTable arrays */ offset = partnum * 2 * sizeof(uint32_t); return (struct logicalVolIntegrityDescImpUse *) (((uint8_t *)(lvid + 1)) + offset); } /* UDF filesystem type */ static int udf_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, udf_fill_super); } static const struct fs_context_operations udf_context_ops = { .parse_param = udf_parse_param, .get_tree = udf_get_tree, .reconfigure = udf_reconfigure, .free = udf_free_fc, }; static struct file_system_type udf_fstype = { .owner = THIS_MODULE, .name = "udf", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = udf_init_fs_context, .parameters = udf_param_spec, }; MODULE_ALIAS_FS("udf"); static struct kmem_cache *udf_inode_cachep; static struct inode *udf_alloc_inode(struct super_block *sb) { struct udf_inode_info *ei; ei = alloc_inode_sb(sb, udf_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->i_unique = 0; ei->i_lenExtents = 0; ei->i_lenStreams = 0; ei->i_next_alloc_block = 0; ei->i_next_alloc_goal = 0; ei->i_strat4096 = 0; ei->i_streamdir = 0; ei->i_hidden = 0; init_rwsem(&ei->i_data_sem); ei->cached_extent.lstart = -1; spin_lock_init(&ei->i_extent_cache_lock); inode_set_iversion(&ei->vfs_inode, 1); return &ei->vfs_inode; } static void udf_free_in_core_inode(struct inode *inode) { kmem_cache_free(udf_inode_cachep, UDF_I(inode)); } static void init_once(void *foo) { struct udf_inode_info *ei = foo; ei->i_data = NULL; inode_init_once(&ei->vfs_inode); } static int __init init_inodecache(void) { udf_inode_cachep = kmem_cache_create("udf_inode_cache", sizeof(struct udf_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT), init_once); if (!udf_inode_cachep) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(udf_inode_cachep); } /* Superblock operations */ static const struct super_operations udf_sb_ops = { .alloc_inode = udf_alloc_inode, .free_inode = udf_free_in_core_inode, .write_inode = udf_write_inode, .evict_inode = udf_evict_inode, .put_super = udf_put_super, .sync_fs = udf_sync_fs, .statfs = udf_statfs, .show_options = udf_show_options, }; struct udf_options { unsigned int blocksize; unsigned int session; unsigned int lastblock; unsigned int anchor; unsigned int flags; umode_t umask; kgid_t gid; kuid_t uid; umode_t fmode; umode_t dmode; struct nls_table *nls_map; }; /* * UDF has historically preserved prior mount options across * a remount, so copy those here if remounting, otherwise set * initial mount defaults. */ static void udf_init_options(struct fs_context *fc, struct udf_options *uopt) { if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { struct super_block *sb = fc->root->d_sb; struct udf_sb_info *sbi = UDF_SB(sb); uopt->flags = sbi->s_flags; uopt->uid = sbi->s_uid; uopt->gid = sbi->s_gid; uopt->umask = sbi->s_umask; uopt->fmode = sbi->s_fmode; uopt->dmode = sbi->s_dmode; uopt->nls_map = NULL; } else { uopt->flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); /* * By default we'll use overflow[ug]id when UDF * inode [ug]id == -1 */ uopt->uid = make_kuid(current_user_ns(), overflowuid); uopt->gid = make_kgid(current_user_ns(), overflowgid); uopt->umask = 0; uopt->fmode = UDF_INVALID_MODE; uopt->dmode = UDF_INVALID_MODE; uopt->nls_map = NULL; uopt->session = 0xFFFFFFFF; } } static int udf_init_fs_context(struct fs_context *fc) { struct udf_options *uopt; uopt = kzalloc(sizeof(*uopt), GFP_KERNEL); if (!uopt) return -ENOMEM; udf_init_options(fc, uopt); fc->fs_private = uopt; fc->ops = &udf_context_ops; return 0; } static void udf_free_fc(struct fs_context *fc) { struct udf_options *uopt = fc->fs_private; unload_nls(uopt->nls_map); kfree(fc->fs_private); } static int __init init_udf_fs(void) { int err; err = init_inodecache(); if (err) goto out1; err = register_filesystem(&udf_fstype); if (err) goto out; return 0; out: destroy_inodecache(); out1: return err; } static void __exit exit_udf_fs(void) { unregister_filesystem(&udf_fstype); destroy_inodecache(); } static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) { struct udf_sb_info *sbi = UDF_SB(sb); sbi->s_partmaps = kcalloc(count, sizeof(*sbi->s_partmaps), GFP_KERNEL); if (!sbi->s_partmaps) { sbi->s_partitions = 0; return -ENOMEM; } sbi->s_partitions = count; return 0; } static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) { int i; int nr_groups = bitmap->s_nr_groups; for (i = 0; i < nr_groups; i++) if (!IS_ERR_OR_NULL(bitmap->s_block_bitmap[i])) brelse(bitmap->s_block_bitmap[i]); kvfree(bitmap); } static void udf_free_partition(struct udf_part_map *map) { int i; struct udf_meta_data *mdata; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) iput(map->s_uspace.s_table); if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) udf_sb_free_bitmap(map->s_uspace.s_bitmap); if (map->s_partition_type == UDF_SPARABLE_MAP15) for (i = 0; i < 4; i++) brelse(map->s_type_specific.s_sparing.s_spar_map[i]); else if (map->s_partition_type == UDF_METADATA_MAP25) { mdata = &map->s_type_specific.s_metadata; iput(mdata->s_metadata_fe); mdata->s_metadata_fe = NULL; iput(mdata->s_mirror_fe); mdata->s_mirror_fe = NULL; iput(mdata->s_bitmap_fe); mdata->s_bitmap_fe = NULL; } } static void udf_sb_free_partitions(struct super_block *sb) { struct udf_sb_info *sbi = UDF_SB(sb); int i; if (!sbi->s_partmaps) return; for (i = 0; i < sbi->s_partitions; i++) udf_free_partition(&sbi->s_partmaps[i]); kfree(sbi->s_partmaps); sbi->s_partmaps = NULL; } static int udf_show_options(struct seq_file *seq, struct dentry *root) { struct super_block *sb = root->d_sb; struct udf_sb_info *sbi = UDF_SB(sb); if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) seq_puts(seq, ",nostrict"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET)) seq_printf(seq, ",bs=%lu", sb->s_blocksize); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) seq_puts(seq, ",unhide"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) seq_puts(seq, ",undelete"); if (!UDF_QUERY_FLAG(sb, UDF_FLAG_USE_AD_IN_ICB)) seq_puts(seq, ",noadinicb"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_USE_SHORT_AD)) seq_puts(seq, ",shortad"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_FORGET)) seq_puts(seq, ",uid=forget"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_FORGET)) seq_puts(seq, ",gid=forget"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) seq_printf(seq, ",uid=%u", from_kuid(&init_user_ns, sbi->s_uid)); if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) seq_printf(seq, ",gid=%u", from_kgid(&init_user_ns, sbi->s_gid)); if (sbi->s_umask != 0) seq_printf(seq, ",umask=%ho", sbi->s_umask); if (sbi->s_fmode != UDF_INVALID_MODE) seq_printf(seq, ",mode=%ho", sbi->s_fmode); if (sbi->s_dmode != UDF_INVALID_MODE) seq_printf(seq, ",dmode=%ho", sbi->s_dmode); if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) seq_printf(seq, ",session=%d", sbi->s_session); if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) seq_printf(seq, ",lastblock=%u", sbi->s_last_block); if (sbi->s_anchor != 0) seq_printf(seq, ",anchor=%u", sbi->s_anchor); if (sbi->s_nls_map) seq_printf(seq, ",iocharset=%s", sbi->s_nls_map->charset); else seq_puts(seq, ",iocharset=utf8"); return 0; } /* * udf_parse_param * * PURPOSE * Parse mount options. * * DESCRIPTION * The following mount options are supported: * * gid= Set the default group. * umask= Set the default umask. * mode= Set the default file permissions. * dmode= Set the default directory permissions. * uid= Set the default user. * bs= Set the block size. * unhide Show otherwise hidden files. * undelete Show deleted files in lists. * adinicb Embed data in the inode (default) * noadinicb Don't embed data in the inode * shortad Use short ad's * longad Use long ad's (default) * nostrict Unset strict conformance * iocharset= Set the NLS character set * * The remaining are for debugging and disaster recovery: * * novrs Skip volume sequence recognition * * The following expect a offset from 0. * * session= Set the CDROM session (default= last session) * anchor= Override standard anchor location. (default= 256) * volume= Override the VolumeDesc location. (unused) * partition= Override the PartitionDesc location. (unused) * lastblock= Set the last block of the filesystem/ * * The following expect a offset from the partition root. * * fileset= Override the fileset block location. (unused) * rootdir= Override the root directory location. (unused) * WARNING: overriding the rootdir to a non-directory may * yield highly unpredictable results. * * PRE-CONDITIONS * fc fs_context with pointer to mount options variable. * param Pointer to fs_parameter being parsed. * * POST-CONDITIONS * <return> 0 Mount options parsed okay. * <return> errno Error parsing mount options. * * HISTORY * July 1, 1997 - Andrew E. Mileski * Written, tested, and released. */ enum { Opt_novrs, Opt_nostrict, Opt_bs, Opt_unhide, Opt_undelete, Opt_noadinicb, Opt_adinicb, Opt_shortad, Opt_longad, Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, Opt_rootdir, Opt_utf8, Opt_iocharset, Opt_err, Opt_fmode, Opt_dmode }; static const struct fs_parameter_spec udf_param_spec[] = { fsparam_flag ("novrs", Opt_novrs), fsparam_flag ("nostrict", Opt_nostrict), fsparam_u32 ("bs", Opt_bs), fsparam_flag ("unhide", Opt_unhide), fsparam_flag ("undelete", Opt_undelete), fsparam_flag_no ("adinicb", Opt_adinicb), fsparam_flag ("shortad", Opt_shortad), fsparam_flag ("longad", Opt_longad), fsparam_string ("gid", Opt_gid), fsparam_string ("uid", Opt_uid), fsparam_u32 ("umask", Opt_umask), fsparam_u32 ("session", Opt_session), fsparam_u32 ("lastblock", Opt_lastblock), fsparam_u32 ("anchor", Opt_anchor), fsparam_u32 ("volume", Opt_volume), fsparam_u32 ("partition", Opt_partition), fsparam_u32 ("fileset", Opt_fileset), fsparam_u32 ("rootdir", Opt_rootdir), fsparam_flag ("utf8", Opt_utf8), fsparam_string ("iocharset", Opt_iocharset), fsparam_u32 ("mode", Opt_fmode), fsparam_u32 ("dmode", Opt_dmode), {} }; static int udf_parse_param(struct fs_context *fc, struct fs_parameter *param) { unsigned int uv; unsigned int n; struct udf_options *uopt = fc->fs_private; struct fs_parse_result result; int token; bool remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE); token = fs_parse(fc, udf_param_spec, param, &result); if (token < 0) return token; switch (token) { case Opt_novrs: uopt->flags |= (1 << UDF_FLAG_NOVRS); break; case Opt_bs: n = result.uint_32; if (n != 512 && n != 1024 && n != 2048 && n != 4096) return -EINVAL; uopt->blocksize = n; uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET); break; case Opt_unhide: uopt->flags |= (1 << UDF_FLAG_UNHIDE); break; case Opt_undelete: uopt->flags |= (1 << UDF_FLAG_UNDELETE); break; case Opt_adinicb: if (result.negated) uopt->flags &= ~(1 << UDF_FLAG_USE_AD_IN_ICB); else uopt->flags |= (1 << UDF_FLAG_USE_AD_IN_ICB); break; case Opt_shortad: uopt->flags |= (1 << UDF_FLAG_USE_SHORT_AD); break; case Opt_longad: uopt->flags &= ~(1 << UDF_FLAG_USE_SHORT_AD); break; case Opt_gid: if (kstrtoint(param->string, 10, &uv) == 0) { kgid_t gid = make_kgid(current_user_ns(), uv); if (!gid_valid(gid)) return -EINVAL; uopt->gid = gid; uopt->flags |= (1 << UDF_FLAG_GID_SET); } else if (!strcmp(param->string, "forget")) { uopt->flags |= (1 << UDF_FLAG_GID_FORGET); } else if (!strcmp(param->string, "ignore")) { /* this option is superseded by gid=<number> */ ; } else { return -EINVAL; } break; case Opt_uid: if (kstrtoint(param->string, 10, &uv) == 0) { kuid_t uid = make_kuid(current_user_ns(), uv); if (!uid_valid(uid)) return -EINVAL; uopt->uid = uid; uopt->flags |= (1 << UDF_FLAG_UID_SET); } else if (!strcmp(param->string, "forget")) { uopt->flags |= (1 << UDF_FLAG_UID_FORGET); } else if (!strcmp(param->string, "ignore")) { /* this option is superseded by uid=<number> */ ; } else { return -EINVAL; } break; case Opt_umask: uopt->umask = result.uint_32; break; case Opt_nostrict: uopt->flags &= ~(1 << UDF_FLAG_STRICT); break; case Opt_session: uopt->session = result.uint_32; if (!remount) uopt->flags |= (1 << UDF_FLAG_SESSION_SET); break; case Opt_lastblock: uopt->lastblock = result.uint_32; if (!remount) uopt->flags |= (1 << UDF_FLAG_LASTBLOCK_SET); break; case Opt_anchor: uopt->anchor = result.uint_32; break; case Opt_volume: case Opt_partition: case Opt_fileset: case Opt_rootdir: /* Ignored (never implemented properly) */ break; case Opt_utf8: if (!remount) { unload_nls(uopt->nls_map); uopt->nls_map = NULL; } break; case Opt_iocharset: if (!remount) { unload_nls(uopt->nls_map); uopt->nls_map = NULL; } /* When nls_map is not loaded then UTF-8 is used */ if (!remount && strcmp(param->string, "utf8") != 0) { uopt->nls_map = load_nls(param->string); if (!uopt->nls_map) { errorf(fc, "iocharset %s not found", param->string); return -EINVAL; } } break; case Opt_fmode: uopt->fmode = result.uint_32 & 0777; break; case Opt_dmode: uopt->dmode = result.uint_32 & 0777; break; default: return -EINVAL; } return 0; } static int udf_reconfigure(struct fs_context *fc) { struct udf_options *uopt = fc->fs_private; struct super_block *sb = fc->root->d_sb; struct udf_sb_info *sbi = UDF_SB(sb); int readonly = fc->sb_flags & SB_RDONLY; int error = 0; if (!readonly && UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT)) return -EACCES; sync_filesystem(sb); write_lock(&sbi->s_cred_lock); sbi->s_flags = uopt->flags; sbi->s_uid = uopt->uid; sbi->s_gid = uopt->gid; sbi->s_umask = uopt->umask; sbi->s_fmode = uopt->fmode; sbi->s_dmode = uopt->dmode; write_unlock(&sbi->s_cred_lock); if (readonly == sb_rdonly(sb)) goto out_unlock; if (readonly) udf_close_lvid(sb); else udf_open_lvid(sb); out_unlock: return error; } /* * Check VSD descriptor. Returns -1 in case we are at the end of volume * recognition area, 0 if the descriptor is valid but non-interesting, 1 if * we found one of NSR descriptors we are looking for. */ static int identify_vsd(const struct volStructDesc *vsd) { int ret = 0; if (!memcmp(vsd->stdIdent, VSD_STD_ID_CD001, VSD_STD_ID_LEN)) { switch (vsd->structType) { case 0: udf_debug("ISO9660 Boot Record found\n"); break; case 1: udf_debug("ISO9660 Primary Volume Descriptor found\n"); break; case 2: udf_debug("ISO9660 Supplementary Volume Descriptor found\n"); break; case 3: udf_debug("ISO9660 Volume Partition Descriptor found\n"); break; case 255: udf_debug("ISO9660 Volume Descriptor Set Terminator found\n"); break; default: udf_debug("ISO9660 VRS (%u) found\n", vsd->structType); break; } } else if (!memcmp(vsd->stdIdent, VSD_STD_ID_BEA01, VSD_STD_ID_LEN)) ; /* ret = 0 */ else if (!memcmp(vsd->stdIdent, VSD_STD_ID_NSR02, VSD_STD_ID_LEN)) ret = 1; else if (!memcmp(vsd->stdIdent, VSD_STD_ID_NSR03, VSD_STD_ID_LEN)) ret = 1; else if (!memcmp(vsd->stdIdent, VSD_STD_ID_BOOT2, VSD_STD_ID_LEN)) ; /* ret = 0 */ else if (!memcmp(vsd->stdIdent, VSD_STD_ID_CDW02, VSD_STD_ID_LEN)) ; /* ret = 0 */ else { /* TEA01 or invalid id : end of volume recognition area */ ret = -1; } return ret; } /* * Check Volume Structure Descriptors (ECMA 167 2/9.1) * We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) * @return 1 if NSR02 or NSR03 found, * -1 if first sector read error, 0 otherwise */ static int udf_check_vsd(struct super_block *sb) { struct volStructDesc *vsd = NULL; loff_t sector = VSD_FIRST_SECTOR_OFFSET; int sectorsize; struct buffer_head *bh = NULL; int nsr = 0; struct udf_sb_info *sbi; loff_t session_offset; sbi = UDF_SB(sb); if (sb->s_blocksize < sizeof(struct volStructDesc)) sectorsize = sizeof(struct volStructDesc); else sectorsize = sb->s_blocksize; session_offset = (loff_t)sbi->s_session << sb->s_blocksize_bits; sector += session_offset; udf_debug("Starting at sector %u (%lu byte sectors)\n", (unsigned int)(sector >> sb->s_blocksize_bits), sb->s_blocksize); /* Process the sequence (if applicable). The hard limit on the sector * offset is arbitrary, hopefully large enough so that all valid UDF * filesystems will be recognised. There is no mention of an upper * bound to the size of the volume recognition area in the standard. * The limit will prevent the code to read all the sectors of a * specially crafted image (like a bluray disc full of CD001 sectors), * potentially causing minutes or even hours of uninterruptible I/O * activity. This actually happened with uninitialised SSD partitions * (all 0xFF) before the check for the limit and all valid IDs were * added */ for (; !nsr && sector < VSD_MAX_SECTOR_OFFSET; sector += sectorsize) { /* Read a block */ bh = sb_bread(sb, sector >> sb->s_blocksize_bits); if (!bh) break; vsd = (struct volStructDesc *)(bh->b_data + (sector & (sb->s_blocksize - 1))); nsr = identify_vsd(vsd); /* Found NSR or end? */ if (nsr) { brelse(bh); break; } /* * Special handling for improperly formatted VRS (e.g., Win10) * where components are separated by 2048 bytes even though * sectors are 4K */ if (sb->s_blocksize == 4096) { nsr = identify_vsd(vsd + 1); /* Ignore unknown IDs... */ if (nsr < 0) nsr = 0; } brelse(bh); } if (nsr > 0) return 1; else if (!bh && sector - session_offset == VSD_FIRST_SECTOR_OFFSET) return -1; else return 0; } static int udf_verify_domain_identifier(struct super_block *sb, struct regid *ident, char *dname) { struct domainIdentSuffix *suffix; if (memcmp(ident->ident, UDF_ID_COMPLIANT, strlen(UDF_ID_COMPLIANT))) { udf_warn(sb, "Not OSTA UDF compliant %s descriptor.\n", dname); goto force_ro; } if (ident->flags & ENTITYID_FLAGS_DIRTY) { udf_warn(sb, "Possibly not OSTA UDF compliant %s descriptor.\n", dname); goto force_ro; } suffix = (struct domainIdentSuffix *)ident->identSuffix; if ((suffix->domainFlags & DOMAIN_FLAGS_HARD_WRITE_PROTECT) || (suffix->domainFlags & DOMAIN_FLAGS_SOFT_WRITE_PROTECT)) { if (!sb_rdonly(sb)) { udf_warn(sb, "Descriptor for %s marked write protected." " Forcing read only mount.\n", dname); } goto force_ro; } return 0; force_ro: if (!sb_rdonly(sb)) return -EACCES; UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); return 0; } static int udf_load_fileset(struct super_block *sb, struct fileSetDesc *fset, struct kernel_lb_addr *root) { int ret; ret = udf_verify_domain_identifier(sb, &fset->domainIdent, "file set"); if (ret < 0) return ret; *root = lelb_to_cpu(fset->rootDirectoryICB.extLocation); UDF_SB(sb)->s_serial_number = le16_to_cpu(fset->descTag.tagSerialNum); udf_debug("Rootdir at block=%u, partition=%u\n", root->logicalBlockNum, root->partitionReferenceNum); return 0; } static int udf_find_fileset(struct super_block *sb, struct kernel_lb_addr *fileset, struct kernel_lb_addr *root) { struct buffer_head *bh; uint16_t ident; int ret; if (fileset->logicalBlockNum == 0xFFFFFFFF && fileset->partitionReferenceNum == 0xFFFF) return -EINVAL; bh = udf_read_ptagged(sb, fileset, 0, &ident); if (!bh) return -EIO; if (ident != TAG_IDENT_FSD) { brelse(bh); return -EINVAL; } udf_debug("Fileset at block=%u, partition=%u\n", fileset->logicalBlockNum, fileset->partitionReferenceNum); UDF_SB(sb)->s_partition = fileset->partitionReferenceNum; ret = udf_load_fileset(sb, (struct fileSetDesc *)bh->b_data, root); brelse(bh); return ret; } /* * Load primary Volume Descriptor Sequence * * Return <0 on error, 0 on success. -EAGAIN is special meaning next sequence * should be tried. */ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) { struct primaryVolDesc *pvoldesc; uint8_t *outstr; struct buffer_head *bh; uint16_t ident; int ret; struct timestamp *ts; outstr = kzalloc(128, GFP_KERNEL); if (!outstr) return -ENOMEM; bh = udf_read_tagged(sb, block, block, &ident); if (!bh) { ret = -EAGAIN; goto out2; } if (ident != TAG_IDENT_PVD) { ret = -EIO; goto out_bh; } pvoldesc = (struct primaryVolDesc *)bh->b_data; udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, pvoldesc->recordingDateAndTime); ts = &pvoldesc->recordingDateAndTime; udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n", le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, ts->minute, le16_to_cpu(ts->typeAndTimezone)); ret = udf_dstrCS0toChar(sb, outstr, 31, pvoldesc->volIdent, 32); if (ret < 0) { strscpy_pad(UDF_SB(sb)->s_volume_ident, "InvalidName"); pr_warn("incorrect volume identification, setting to " "'InvalidName'\n"); } else { strscpy_pad(UDF_SB(sb)->s_volume_ident, outstr); } udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident); ret = udf_dstrCS0toChar(sb, outstr, 127, pvoldesc->volSetIdent, 128); if (ret < 0) { ret = 0; goto out_bh; } outstr[ret] = 0; udf_debug("volSetIdent[] = '%s'\n", outstr); ret = 0; out_bh: brelse(bh); out2: kfree(outstr); return ret; } struct inode *udf_find_metadata_inode_efe(struct super_block *sb, u32 meta_file_loc, u32 partition_ref) { struct kernel_lb_addr addr; struct inode *metadata_fe; addr.logicalBlockNum = meta_file_loc; addr.partitionReferenceNum = partition_ref; metadata_fe = udf_iget_special(sb, &addr); if (IS_ERR(metadata_fe)) { udf_warn(sb, "metadata inode efe not found\n"); return metadata_fe; } if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); iput(metadata_fe); return ERR_PTR(-EIO); } return metadata_fe; } static int udf_load_metadata_files(struct super_block *sb, int partition, int type1_index) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; struct udf_meta_data *mdata; struct kernel_lb_addr addr; struct inode *fe; map = &sbi->s_partmaps[partition]; mdata = &map->s_type_specific.s_metadata; mdata->s_phys_partition_ref = type1_index; /* metadata address */ udf_debug("Metadata file location: block = %u part = %u\n", mdata->s_meta_file_loc, mdata->s_phys_partition_ref); fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc, mdata->s_phys_partition_ref); if (IS_ERR(fe)) { /* mirror file entry */ udf_debug("Mirror metadata file location: block = %u part = %u\n", mdata->s_mirror_file_loc, mdata->s_phys_partition_ref); fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc, mdata->s_phys_partition_ref); if (IS_ERR(fe)) { udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); return PTR_ERR(fe); } mdata->s_mirror_fe = fe; } else mdata->s_metadata_fe = fe; /* * bitmap file entry * Note: * Load only if bitmap file location differs from 0xFFFFFFFF (DCN-5102) */ if (mdata->s_bitmap_file_loc != 0xFFFFFFFF) { addr.logicalBlockNum = mdata->s_bitmap_file_loc; addr.partitionReferenceNum = mdata->s_phys_partition_ref; udf_debug("Bitmap file location: block = %u part = %u\n", addr.logicalBlockNum, addr.partitionReferenceNum); fe = udf_iget_special(sb, &addr); if (IS_ERR(fe)) { if (sb_rdonly(sb)) udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); else { udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); return PTR_ERR(fe); } } else mdata->s_bitmap_fe = fe; } udf_debug("udf_load_metadata_files Ok\n"); return 0; } int udf_compute_nr_groups(struct super_block *sb, u32 partition) { struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; return DIV_ROUND_UP(map->s_partition_len + (sizeof(struct spaceBitmapDesc) << 3), sb->s_blocksize * 8); } static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) { struct udf_bitmap *bitmap; int nr_groups = udf_compute_nr_groups(sb, index); bitmap = kvzalloc(struct_size(bitmap, s_block_bitmap, nr_groups), GFP_KERNEL); if (!bitmap) return NULL; bitmap->s_nr_groups = nr_groups; return bitmap; } static int check_partition_desc(struct super_block *sb, struct partitionDesc *p, struct udf_part_map *map) { bool umap, utable, fmap, ftable; struct partitionHeaderDesc *phd; switch (le32_to_cpu(p->accessType)) { case PD_ACCESS_TYPE_READ_ONLY: case PD_ACCESS_TYPE_WRITE_ONCE: case PD_ACCESS_TYPE_NONE: goto force_ro; } /* No Partition Header Descriptor? */ if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) goto force_ro; phd = (struct partitionHeaderDesc *)p->partitionContentsUse; utable = phd->unallocSpaceTable.extLength; umap = phd->unallocSpaceBitmap.extLength; ftable = phd->freedSpaceTable.extLength; fmap = phd->freedSpaceBitmap.extLength; /* No allocation info? */ if (!utable && !umap && !ftable && !fmap) goto force_ro; /* We don't support blocks that require erasing before overwrite */ if (ftable || fmap) goto force_ro; /* UDF 2.60: 2.3.3 - no mixing of tables & bitmaps, no VAT. */ if (utable && umap) goto force_ro; if (map->s_partition_type == UDF_VIRTUAL_MAP15 || map->s_partition_type == UDF_VIRTUAL_MAP20 || map->s_partition_type == UDF_METADATA_MAP25) goto force_ro; return 0; force_ro: if (!sb_rdonly(sb)) return -EACCES; UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); return 0; } static int udf_fill_partdesc_info(struct super_block *sb, struct partitionDesc *p, int p_index) { struct udf_part_map *map; struct udf_sb_info *sbi = UDF_SB(sb); struct partitionHeaderDesc *phd; u32 sum; int err; map = &sbi->s_partmaps[p_index]; map->s_partition_len = le32_to_cpu(p->partitionLength); /* blocks */ map->s_partition_root = le32_to_cpu(p->partitionStartingLocation); if (check_add_overflow(map->s_partition_root, map->s_partition_len, &sum)) { udf_err(sb, "Partition %d has invalid location %u + %u\n", p_index, map->s_partition_root, map->s_partition_len); return -EFSCORRUPTED; } if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY)) map->s_partition_flags |= UDF_PART_FLAG_READ_ONLY; if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_WRITE_ONCE)) map->s_partition_flags |= UDF_PART_FLAG_WRITE_ONCE; if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_REWRITABLE)) map->s_partition_flags |= UDF_PART_FLAG_REWRITABLE; if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE)) map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE; udf_debug("Partition (%d type %x) starts at physical %u, block length %u\n", p_index, map->s_partition_type, map->s_partition_root, map->s_partition_len); err = check_partition_desc(sb, p, map); if (err) return err; /* * Skip loading allocation info it we cannot ever write to the fs. * This is a correctness thing as we may have decided to force ro mount * to avoid allocation info we don't support. */ if (UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT)) return 0; phd = (struct partitionHeaderDesc *)p->partitionContentsUse; if (phd->unallocSpaceTable.extLength) { struct kernel_lb_addr loc = { .logicalBlockNum = le32_to_cpu( phd->unallocSpaceTable.extPosition), .partitionReferenceNum = p_index, }; struct inode *inode; inode = udf_iget_special(sb, &loc); if (IS_ERR(inode)) { udf_debug("cannot load unallocSpaceTable (part %d)\n", p_index); return PTR_ERR(inode); } map->s_uspace.s_table = inode; map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; udf_debug("unallocSpaceTable (part %d) @ %lu\n", p_index, map->s_uspace.s_table->i_ino); } if (phd->unallocSpaceBitmap.extLength) { struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); if (!bitmap) return -ENOMEM; map->s_uspace.s_bitmap = bitmap; bitmap->s_extPosition = le32_to_cpu( phd->unallocSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; /* Check whether math over bitmap won't overflow. */ if (check_add_overflow(map->s_partition_len, sizeof(struct spaceBitmapDesc) << 3, &sum)) { udf_err(sb, "Partition %d is too long (%u)\n", p_index, map->s_partition_len); return -EFSCORRUPTED; } udf_debug("unallocSpaceBitmap (part %d) @ %u\n", p_index, bitmap->s_extPosition); } return 0; } static void udf_find_vat_block(struct super_block *sb, int p_index, int type1_index, sector_t start_block) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map = &sbi->s_partmaps[p_index]; sector_t vat_block; struct kernel_lb_addr ino; struct inode *inode; /* * VAT file entry is in the last recorded block. Some broken disks have * it a few blocks before so try a bit harder... */ ino.partitionReferenceNum = type1_index; for (vat_block = start_block; vat_block >= map->s_partition_root && vat_block >= start_block - 3; vat_block--) { ino.logicalBlockNum = vat_block - map->s_partition_root; inode = udf_iget_special(sb, &ino); if (!IS_ERR(inode)) { sbi->s_vat_inode = inode; break; } } } static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map = &sbi->s_partmaps[p_index]; struct buffer_head *bh = NULL; struct udf_inode_info *vati; struct virtualAllocationTable20 *vat20; sector_t blocks = sb_bdev_nr_blocks(sb); udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); if (!sbi->s_vat_inode && sbi->s_last_block != blocks - 1) { pr_notice("Failed to read VAT inode from the last recorded block (%lu), retrying with the last block of the device (%lu).\n", (unsigned long)sbi->s_last_block, (unsigned long)blocks - 1); udf_find_vat_block(sb, p_index, type1_index, blocks - 1); } if (!sbi->s_vat_inode) return -EIO; if (map->s_partition_type == UDF_VIRTUAL_MAP15) { map->s_type_specific.s_virtual.s_start_offset = 0; map->s_type_specific.s_virtual.s_num_entries = (sbi->s_vat_inode->i_size - 36) >> 2; } else if (map->s_partition_type == UDF_VIRTUAL_MAP20) { vati = UDF_I(sbi->s_vat_inode); if (vati->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { int err = 0; bh = udf_bread(sbi->s_vat_inode, 0, 0, &err); if (!bh) { if (!err) err = -EFSCORRUPTED; return err; } vat20 = (struct virtualAllocationTable20 *)bh->b_data; } else { vat20 = (struct virtualAllocationTable20 *) vati->i_data; } map->s_type_specific.s_virtual.s_start_offset = le16_to_cpu(vat20->lengthHeader); map->s_type_specific.s_virtual.s_num_entries = (sbi->s_vat_inode->i_size - map->s_type_specific.s_virtual. s_start_offset) >> 2; brelse(bh); } return 0; } /* * Load partition descriptor block * * Returns <0 on error, 0 on success, -EAGAIN is special - try next descriptor * sequence. */ static int udf_load_partdesc(struct super_block *sb, sector_t block) { struct buffer_head *bh; struct partitionDesc *p; struct udf_part_map *map; struct udf_sb_info *sbi = UDF_SB(sb); int i, type1_idx; uint16_t partitionNumber; uint16_t ident; int ret; bh = udf_read_tagged(sb, block, block, &ident); if (!bh) return -EAGAIN; if (ident != TAG_IDENT_PD) { ret = 0; goto out_bh; } p = (struct partitionDesc *)bh->b_data; partitionNumber = le16_to_cpu(p->partitionNumber); /* First scan for TYPE1 and SPARABLE partitions */ for (i = 0; i < sbi->s_partitions; i++) { map = &sbi->s_partmaps[i]; udf_debug("Searching map: (%u == %u)\n", map->s_partition_num, partitionNumber); if (map->s_partition_num == partitionNumber && (map->s_partition_type == UDF_TYPE1_MAP15 || map->s_partition_type == UDF_SPARABLE_MAP15)) break; } if (i >= sbi->s_partitions) { udf_debug("Partition (%u) not found in partition map\n", partitionNumber); ret = 0; goto out_bh; } ret = udf_fill_partdesc_info(sb, p, i); if (ret < 0) goto out_bh; /* * Now rescan for VIRTUAL or METADATA partitions when SPARABLE and * PHYSICAL partitions are already set up */ type1_idx = i; map = NULL; /* supress 'maybe used uninitialized' warning */ for (i = 0; i < sbi->s_partitions; i++) { map = &sbi->s_partmaps[i]; if (map->s_partition_num == partitionNumber && (map->s_partition_type == UDF_VIRTUAL_MAP15 || map->s_partition_type == UDF_VIRTUAL_MAP20 || map->s_partition_type == UDF_METADATA_MAP25)) break; } if (i >= sbi->s_partitions) { ret = 0; goto out_bh; } ret = udf_fill_partdesc_info(sb, p, i); if (ret < 0) goto out_bh; if (map->s_partition_type == UDF_METADATA_MAP25) { ret = udf_load_metadata_files(sb, i, type1_idx); if (ret < 0) { udf_err(sb, "error loading MetaData partition map %d\n", i); goto out_bh; } } else { /* * If we have a partition with virtual map, we don't handle * writing to it (we overwrite blocks instead of relocating * them). */ if (!sb_rdonly(sb)) { ret = -EACCES; goto out_bh; } UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); ret = udf_load_vat(sb, i, type1_idx); if (ret < 0) goto out_bh; } ret = 0; out_bh: /* In case loading failed, we handle cleanup in udf_fill_super */ brelse(bh); return ret; } static int udf_load_sparable_map(struct super_block *sb, struct udf_part_map *map, struct sparablePartitionMap *spm) { uint32_t loc; uint16_t ident; struct sparingTable *st; struct udf_sparing_data *sdata = &map->s_type_specific.s_sparing; int i; struct buffer_head *bh; map->s_partition_type = UDF_SPARABLE_MAP15; sdata->s_packet_len = le16_to_cpu(spm->packetLength); if (!is_power_of_2(sdata->s_packet_len)) { udf_err(sb, "error loading logical volume descriptor: " "Invalid packet length %u\n", (unsigned)sdata->s_packet_len); return -EIO; } if (spm->numSparingTables > 4) { udf_err(sb, "error loading logical volume descriptor: " "Too many sparing tables (%d)\n", (int)spm->numSparingTables); return -EIO; } if (le32_to_cpu(spm->sizeSparingTable) > sb->s_blocksize) { udf_err(sb, "error loading logical volume descriptor: " "Too big sparing table size (%u)\n", le32_to_cpu(spm->sizeSparingTable)); return -EIO; } for (i = 0; i < spm->numSparingTables; i++) { loc = le32_to_cpu(spm->locSparingTable[i]); bh = udf_read_tagged(sb, loc, loc, &ident); if (!bh) continue; st = (struct sparingTable *)bh->b_data; if (ident != 0 || strncmp(st->sparingIdent.ident, UDF_ID_SPARING, strlen(UDF_ID_SPARING)) || sizeof(*st) + le16_to_cpu(st->reallocationTableLen) > sb->s_blocksize) { brelse(bh); continue; } sdata->s_spar_map[i] = bh; } map->s_partition_func = udf_get_pblock_spar15; return 0; } static int udf_load_logicalvol(struct super_block *sb, sector_t block, struct kernel_lb_addr *fileset) { struct logicalVolDesc *lvd; int i, offset; uint8_t type; struct udf_sb_info *sbi = UDF_SB(sb); struct genericPartitionMap *gpm; uint16_t ident; struct buffer_head *bh; unsigned int table_len, part_map_count; int ret; bh = udf_read_tagged(sb, block, block, &ident); if (!bh) return -EAGAIN; BUG_ON(ident != TAG_IDENT_LVD); lvd = (struct logicalVolDesc *)bh->b_data; table_len = le32_to_cpu(lvd->mapTableLength); if (table_len > sb->s_blocksize - sizeof(*lvd)) { udf_err(sb, "error loading logical volume descriptor: " "Partition table too long (%u > %lu)\n", table_len, sb->s_blocksize - sizeof(*lvd)); ret = -EIO; goto out_bh; } ret = udf_verify_domain_identifier(sb, &lvd->domainIdent, "logical volume"); if (ret) goto out_bh; part_map_count = le32_to_cpu(lvd->numPartitionMaps); if (part_map_count > table_len / sizeof(struct genericPartitionMap1)) { udf_err(sb, "error loading logical volume descriptor: " "Too many partition maps (%u > %u)\n", part_map_count, table_len / (unsigned)sizeof(struct genericPartitionMap1)); ret = -EIO; goto out_bh; } ret = udf_sb_alloc_partition_maps(sb, part_map_count); if (ret) goto out_bh; for (i = 0, offset = 0; i < sbi->s_partitions && offset < table_len; i++, offset += gpm->partitionMapLength) { struct udf_part_map *map = &sbi->s_partmaps[i]; gpm = (struct genericPartitionMap *) &(lvd->partitionMaps[offset]); type = gpm->partitionMapType; if (type == 1) { struct genericPartitionMap1 *gpm1 = (struct genericPartitionMap1 *)gpm; map->s_partition_type = UDF_TYPE1_MAP15; map->s_volumeseqnum = le16_to_cpu(gpm1->volSeqNum); map->s_partition_num = le16_to_cpu(gpm1->partitionNum); map->s_partition_func = NULL; } else if (type == 2) { struct udfPartitionMap2 *upm2 = (struct udfPartitionMap2 *)gpm; if (!strncmp(upm2->partIdent.ident, UDF_ID_VIRTUAL, strlen(UDF_ID_VIRTUAL))) { u16 suf = le16_to_cpu(((__le16 *)upm2->partIdent. identSuffix)[0]); if (suf < 0x0200) { map->s_partition_type = UDF_VIRTUAL_MAP15; map->s_partition_func = udf_get_pblock_virt15; } else { map->s_partition_type = UDF_VIRTUAL_MAP20; map->s_partition_func = udf_get_pblock_virt20; } } else if (!strncmp(upm2->partIdent.ident, UDF_ID_SPARABLE, strlen(UDF_ID_SPARABLE))) { ret = udf_load_sparable_map(sb, map, (struct sparablePartitionMap *)gpm); if (ret < 0) goto out_bh; } else if (!strncmp(upm2->partIdent.ident, UDF_ID_METADATA, strlen(UDF_ID_METADATA))) { struct udf_meta_data *mdata = &map->s_type_specific.s_metadata; struct metadataPartitionMap *mdm = (struct metadataPartitionMap *) &(lvd->partitionMaps[offset]); udf_debug("Parsing Logical vol part %d type %u id=%s\n", i, type, UDF_ID_METADATA); map->s_partition_type = UDF_METADATA_MAP25; map->s_partition_func = udf_get_pblock_meta25; mdata->s_meta_file_loc = le32_to_cpu(mdm->metadataFileLoc); mdata->s_mirror_file_loc = le32_to_cpu(mdm->metadataMirrorFileLoc); mdata->s_bitmap_file_loc = le32_to_cpu(mdm->metadataBitmapFileLoc); mdata->s_alloc_unit_size = le32_to_cpu(mdm->allocUnitSize); mdata->s_align_unit_size = le16_to_cpu(mdm->alignUnitSize); if (mdm->flags & 0x01) mdata->s_flags |= MF_DUPLICATE_MD; udf_debug("Metadata Ident suffix=0x%x\n", le16_to_cpu(*(__le16 *) mdm->partIdent.identSuffix)); udf_debug("Metadata part num=%u\n", le16_to_cpu(mdm->partitionNum)); udf_debug("Metadata part alloc unit size=%u\n", le32_to_cpu(mdm->allocUnitSize)); udf_debug("Metadata file loc=%u\n", le32_to_cpu(mdm->metadataFileLoc)); udf_debug("Mirror file loc=%u\n", le32_to_cpu(mdm->metadataMirrorFileLoc)); udf_debug("Bitmap file loc=%u\n", le32_to_cpu(mdm->metadataBitmapFileLoc)); udf_debug("Flags: %d %u\n", mdata->s_flags, mdm->flags); } else { udf_debug("Unknown ident: %s\n", upm2->partIdent.ident); continue; } map->s_volumeseqnum = le16_to_cpu(upm2->volSeqNum); map->s_partition_num = le16_to_cpu(upm2->partitionNum); } udf_debug("Partition (%d:%u) type %u on volume %u\n", i, map->s_partition_num, type, map->s_volumeseqnum); } if (fileset) { struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]); *fileset = lelb_to_cpu(la->extLocation); udf_debug("FileSet found in LogicalVolDesc at block=%u, partition=%u\n", fileset->logicalBlockNum, fileset->partitionReferenceNum); } if (lvd->integritySeqExt.extLength) udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt)); ret = 0; if (!sbi->s_lvid_bh) { /* We can't generate unique IDs without a valid LVID */ if (sb_rdonly(sb)) { UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); } else { udf_warn(sb, "Damaged or missing LVID, forcing " "readonly mount\n"); ret = -EACCES; } } out_bh: brelse(bh); return ret; } static bool udf_lvid_valid(struct super_block *sb, struct logicalVolIntegrityDesc *lvid) { u32 parts, impuselen; parts = le32_to_cpu(lvid->numOfPartitions); impuselen = le32_to_cpu(lvid->lengthOfImpUse); if (parts >= sb->s_blocksize || impuselen >= sb->s_blocksize || sizeof(struct logicalVolIntegrityDesc) + impuselen + 2 * parts * sizeof(u32) > sb->s_blocksize) return false; return true; } /* * Find the prevailing Logical Volume Integrity Descriptor. */ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc) { struct buffer_head *bh, *final_bh; uint16_t ident; struct udf_sb_info *sbi = UDF_SB(sb); struct logicalVolIntegrityDesc *lvid; int indirections = 0; while (++indirections <= UDF_MAX_LVID_NESTING) { final_bh = NULL; while (loc.extLength > 0 && (bh = udf_read_tagged(sb, loc.extLocation, loc.extLocation, &ident))) { if (ident != TAG_IDENT_LVID) { brelse(bh); break; } brelse(final_bh); final_bh = bh; loc.extLength -= sb->s_blocksize; loc.extLocation++; } if (!final_bh) return; lvid = (struct logicalVolIntegrityDesc *)final_bh->b_data; if (udf_lvid_valid(sb, lvid)) { brelse(sbi->s_lvid_bh); sbi->s_lvid_bh = final_bh; } else { udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), " "ignoring.\n", le32_to_cpu(lvid->numOfPartitions), le32_to_cpu(lvid->lengthOfImpUse)); } if (lvid->nextIntegrityExt.extLength == 0) return; loc = leea_to_cpu(lvid->nextIntegrityExt); } udf_warn(sb, "Too many LVID indirections (max %u), ignoring.\n", UDF_MAX_LVID_NESTING); brelse(sbi->s_lvid_bh); sbi->s_lvid_bh = NULL; } /* * Step for reallocation of table of partition descriptor sequence numbers. * Must be power of 2. */ #define PART_DESC_ALLOC_STEP 32 struct part_desc_seq_scan_data { struct udf_vds_record rec; u32 partnum; }; struct desc_seq_scan_data { struct udf_vds_record vds[VDS_POS_LENGTH]; unsigned int size_part_descs; unsigned int num_part_descs; struct part_desc_seq_scan_data *part_descs_loc; }; static struct udf_vds_record *handle_partition_descriptor( struct buffer_head *bh, struct desc_seq_scan_data *data) { struct partitionDesc *desc = (struct partitionDesc *)bh->b_data; int partnum; int i; partnum = le16_to_cpu(desc->partitionNumber); for (i = 0; i < data->num_part_descs; i++) if (partnum == data->part_descs_loc[i].partnum) return &(data->part_descs_loc[i].rec); if (data->num_part_descs >= data->size_part_descs) { struct part_desc_seq_scan_data *new_loc; unsigned int new_size = ALIGN(partnum, PART_DESC_ALLOC_STEP); new_loc = kcalloc(new_size, sizeof(*new_loc), GFP_KERNEL); if (!new_loc) return ERR_PTR(-ENOMEM); memcpy(new_loc, data->part_descs_loc, data->size_part_descs * sizeof(*new_loc)); kfree(data->part_descs_loc); data->part_descs_loc = new_loc; data->size_part_descs = new_size; } return &(data->part_descs_loc[data->num_part_descs++].rec); } static struct udf_vds_record *get_volume_descriptor_record(uint16_t ident, struct buffer_head *bh, struct desc_seq_scan_data *data) { switch (ident) { case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */ return &(data->vds[VDS_POS_PRIMARY_VOL_DESC]); case TAG_IDENT_IUVD: /* ISO 13346 3/10.4 */ return &(data->vds[VDS_POS_IMP_USE_VOL_DESC]); case TAG_IDENT_LVD: /* ISO 13346 3/10.6 */ return &(data->vds[VDS_POS_LOGICAL_VOL_DESC]); case TAG_IDENT_USD: /* ISO 13346 3/10.8 */ return &(data->vds[VDS_POS_UNALLOC_SPACE_DESC]); case TAG_IDENT_PD: /* ISO 13346 3/10.5 */ return handle_partition_descriptor(bh, data); } return NULL; } /* * Process a main/reserve volume descriptor sequence. * @block First block of first extent of the sequence. * @lastblock Lastblock of first extent of the sequence. * @fileset There we store extent containing root fileset * * Returns <0 on error, 0 on success. -EAGAIN is special - try next descriptor * sequence */ static noinline int udf_process_sequence( struct super_block *sb, sector_t block, sector_t lastblock, struct kernel_lb_addr *fileset) { struct buffer_head *bh = NULL; struct udf_vds_record *curr; struct generic_desc *gd; struct volDescPtr *vdp; bool done = false; uint32_t vdsn; uint16_t ident; int ret; unsigned int indirections = 0; struct desc_seq_scan_data data; unsigned int i; memset(data.vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); data.size_part_descs = PART_DESC_ALLOC_STEP; data.num_part_descs = 0; data.part_descs_loc = kcalloc(data.size_part_descs, sizeof(*data.part_descs_loc), GFP_KERNEL); if (!data.part_descs_loc) return -ENOMEM; /* * Read the main descriptor sequence and find which descriptors * are in it. */ for (; (!done && block <= lastblock); block++) { bh = udf_read_tagged(sb, block, block, &ident); if (!bh) break; /* Process each descriptor (ISO 13346 3/8.3-8.4) */ gd = (struct generic_desc *)bh->b_data; vdsn = le32_to_cpu(gd->volDescSeqNum); switch (ident) { case TAG_IDENT_VDP: /* ISO 13346 3/10.3 */ if (++indirections > UDF_MAX_TD_NESTING) { udf_err(sb, "too many Volume Descriptor " "Pointers (max %u supported)\n", UDF_MAX_TD_NESTING); brelse(bh); ret = -EIO; goto out; } vdp = (struct volDescPtr *)bh->b_data; block = le32_to_cpu(vdp->nextVolDescSeqExt.extLocation); lastblock = le32_to_cpu( vdp->nextVolDescSeqExt.extLength) >> sb->s_blocksize_bits; lastblock += block - 1; /* For loop is going to increment 'block' again */ block--; break; case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */ case TAG_IDENT_IUVD: /* ISO 13346 3/10.4 */ case TAG_IDENT_LVD: /* ISO 13346 3/10.6 */ case TAG_IDENT_USD: /* ISO 13346 3/10.8 */ case TAG_IDENT_PD: /* ISO 13346 3/10.5 */ curr = get_volume_descriptor_record(ident, bh, &data); if (IS_ERR(curr)) { brelse(bh); ret = PTR_ERR(curr); goto out; } /* Descriptor we don't care about? */ if (!curr) break; if (vdsn >= curr->volDescSeqNum) { curr->volDescSeqNum = vdsn; curr->block = block; } break; case TAG_IDENT_TD: /* ISO 13346 3/10.9 */ done = true; break; } brelse(bh); } /* * Now read interesting descriptors again and process them * in a suitable order */ if (!data.vds[VDS_POS_PRIMARY_VOL_DESC].block) { udf_err(sb, "Primary Volume Descriptor not found!\n"); ret = -EAGAIN; goto out; } ret = udf_load_pvoldesc(sb, data.vds[VDS_POS_PRIMARY_VOL_DESC].block); if (ret < 0) goto out; if (data.vds[VDS_POS_LOGICAL_VOL_DESC].block) { ret = udf_load_logicalvol(sb, data.vds[VDS_POS_LOGICAL_VOL_DESC].block, fileset); if (ret < 0) goto out; } /* Now handle prevailing Partition Descriptors */ for (i = 0; i < data.num_part_descs; i++) { ret = udf_load_partdesc(sb, data.part_descs_loc[i].rec.block); if (ret < 0) goto out; } ret = 0; out: kfree(data.part_descs_loc); return ret; } /* * Load Volume Descriptor Sequence described by anchor in bh * * Returns <0 on error, 0 on success */ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, struct kernel_lb_addr *fileset) { struct anchorVolDescPtr *anchor; sector_t main_s, main_e, reserve_s, reserve_e; int ret; anchor = (struct anchorVolDescPtr *)bh->b_data; /* Locate the main sequence */ main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); main_e = main_e >> sb->s_blocksize_bits; main_e += main_s - 1; /* Locate the reserve sequence */ reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation); reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength); reserve_e = reserve_e >> sb->s_blocksize_bits; reserve_e += reserve_s - 1; /* Process the main & reserve sequences */ /* responsible for finding the PartitionDesc(s) */ ret = udf_process_sequence(sb, main_s, main_e, fileset); if (ret != -EAGAIN) return ret; udf_sb_free_partitions(sb); ret = udf_process_sequence(sb, reserve_s, reserve_e, fileset); if (ret < 0) { udf_sb_free_partitions(sb); /* No sequence was OK, return -EIO */ if (ret == -EAGAIN) ret = -EIO; } return ret; } /* * Check whether there is an anchor block in the given block and * load Volume Descriptor Sequence if so. * * Returns <0 on error, 0 on success, -EAGAIN is special - try next anchor * block */ static int udf_check_anchor_block(struct super_block *sb, sector_t block, struct kernel_lb_addr *fileset) { struct buffer_head *bh; uint16_t ident; int ret; bh = udf_read_tagged(sb, block, block, &ident); if (!bh) return -EAGAIN; if (ident != TAG_IDENT_AVDP) { brelse(bh); return -EAGAIN; } ret = udf_load_sequence(sb, bh, fileset); brelse(bh); return ret; } /* * Search for an anchor volume descriptor pointer. * * Returns < 0 on error, 0 on success. -EAGAIN is special - try next set * of anchors. */ static int udf_scan_anchors(struct super_block *sb, udf_pblk_t *lastblock, struct kernel_lb_addr *fileset) { udf_pblk_t last[6]; int i; struct udf_sb_info *sbi = UDF_SB(sb); int last_count = 0; int ret; /* First try user provided anchor */ if (sbi->s_anchor) { ret = udf_check_anchor_block(sb, sbi->s_anchor, fileset); if (ret != -EAGAIN) return ret; } /* * according to spec, anchor is in either: * block 256 * lastblock-256 * lastblock * however, if the disc isn't closed, it could be 512. */ ret = udf_check_anchor_block(sb, sbi->s_session + 256, fileset); if (ret != -EAGAIN) return ret; /* * The trouble is which block is the last one. Drives often misreport * this so we try various possibilities. */ last[last_count++] = *lastblock; if (*lastblock >= 1) last[last_count++] = *lastblock - 1; last[last_count++] = *lastblock + 1; if (*lastblock >= 2) last[last_count++] = *lastblock - 2; if (*lastblock >= 150) last[last_count++] = *lastblock - 150; if (*lastblock >= 152) last[last_count++] = *lastblock - 152; for (i = 0; i < last_count; i++) { if (last[i] >= sb_bdev_nr_blocks(sb)) continue; ret = udf_check_anchor_block(sb, last[i], fileset); if (ret != -EAGAIN) { if (!ret) *lastblock = last[i]; return ret; } if (last[i] < 256) continue; ret = udf_check_anchor_block(sb, last[i] - 256, fileset); if (ret != -EAGAIN) { if (!ret) *lastblock = last[i]; return ret; } } /* Finally try block 512 in case media is open */ return udf_check_anchor_block(sb, sbi->s_session + 512, fileset); } /* * Check Volume Structure Descriptor, find Anchor block and load Volume * Descriptor Sequence. * * Returns < 0 on error, 0 on success. -EAGAIN is special meaning anchor * block was not found. */ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, int silent, struct kernel_lb_addr *fileset) { struct udf_sb_info *sbi = UDF_SB(sb); int nsr = 0; int ret; if (!sb_set_blocksize(sb, uopt->blocksize)) { if (!silent) udf_warn(sb, "Bad block size\n"); return -EINVAL; } sbi->s_last_block = uopt->lastblock; if (!UDF_QUERY_FLAG(sb, UDF_FLAG_NOVRS)) { /* Check that it is NSR02 compliant */ nsr = udf_check_vsd(sb); if (!nsr) { if (!silent) udf_warn(sb, "No VRS found\n"); return -EINVAL; } if (nsr == -1) udf_debug("Failed to read sector at offset %d. " "Assuming open disc. Skipping validity " "check\n", VSD_FIRST_SECTOR_OFFSET); if (!sbi->s_last_block) sbi->s_last_block = udf_get_last_block(sb); } else { udf_debug("Validity check skipped because of novrs option\n"); } /* Look for anchor block and load Volume Descriptor Sequence */ sbi->s_anchor = uopt->anchor; ret = udf_scan_anchors(sb, &sbi->s_last_block, fileset); if (ret < 0) { if (!silent && ret == -EAGAIN) udf_warn(sb, "No anchor found\n"); return ret; } return 0; } static void udf_finalize_lvid(struct logicalVolIntegrityDesc *lvid) { struct timespec64 ts; ktime_get_real_ts64(&ts); udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts); lvid->descTag.descCRC = cpu_to_le16( crc_itu_t(0, (char *)lvid + sizeof(struct tag), le16_to_cpu(lvid->descTag.descCRCLength))); lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); } static void udf_open_lvid(struct super_block *sb) { struct udf_sb_info *sbi = UDF_SB(sb); struct buffer_head *bh = sbi->s_lvid_bh; struct logicalVolIntegrityDesc *lvid; struct logicalVolIntegrityDescImpUse *lvidiu; if (!bh) return; lvid = (struct logicalVolIntegrityDesc *)bh->b_data; lvidiu = udf_sb_lvidiu(sb); if (!lvidiu) return; mutex_lock(&sbi->s_alloc_mutex); lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE) lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN); else UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT); udf_finalize_lvid(lvid); mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; mutex_unlock(&sbi->s_alloc_mutex); /* Make opening of filesystem visible on the media immediately */ sync_dirty_buffer(bh); } static void udf_close_lvid(struct super_block *sb) { struct udf_sb_info *sbi = UDF_SB(sb); struct buffer_head *bh = sbi->s_lvid_bh; struct logicalVolIntegrityDesc *lvid; struct logicalVolIntegrityDescImpUse *lvidiu; if (!bh) return; lvid = (struct logicalVolIntegrityDesc *)bh->b_data; lvidiu = udf_sb_lvidiu(sb); if (!lvidiu) return; mutex_lock(&sbi->s_alloc_mutex); lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev)) lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION); if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev)) lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev); if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev)) lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev); if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT)) lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); /* * We set buffer uptodate unconditionally here to avoid spurious * warnings from mark_buffer_dirty() when previous EIO has marked * the buffer as !uptodate */ set_buffer_uptodate(bh); udf_finalize_lvid(lvid); mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; mutex_unlock(&sbi->s_alloc_mutex); /* Make closing of filesystem visible on the media immediately */ sync_dirty_buffer(bh); } u64 lvid_get_unique_id(struct super_block *sb) { struct buffer_head *bh; struct udf_sb_info *sbi = UDF_SB(sb); struct logicalVolIntegrityDesc *lvid; struct logicalVolHeaderDesc *lvhd; u64 uniqueID; u64 ret; bh = sbi->s_lvid_bh; if (!bh) return 0; lvid = (struct logicalVolIntegrityDesc *)bh->b_data; lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse; mutex_lock(&sbi->s_alloc_mutex); ret = uniqueID = le64_to_cpu(lvhd->uniqueID); if (!(++uniqueID & 0xFFFFFFFF)) uniqueID += 16; lvhd->uniqueID = cpu_to_le64(uniqueID); udf_updated_lvid(sb); mutex_unlock(&sbi->s_alloc_mutex); return ret; } static int udf_fill_super(struct super_block *sb, struct fs_context *fc) { int ret = -EINVAL; struct inode *inode = NULL; struct udf_options *uopt = fc->fs_private; struct kernel_lb_addr rootdir, fileset; struct udf_sb_info *sbi; bool lvid_open = false; int silent = fc->sb_flags & SB_SILENT; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; mutex_init(&sbi->s_alloc_mutex); fileset.logicalBlockNum = 0xFFFFFFFF; fileset.partitionReferenceNum = 0xFFFF; sbi->s_flags = uopt->flags; sbi->s_uid = uopt->uid; sbi->s_gid = uopt->gid; sbi->s_umask = uopt->umask; sbi->s_fmode = uopt->fmode; sbi->s_dmode = uopt->dmode; sbi->s_nls_map = uopt->nls_map; uopt->nls_map = NULL; rwlock_init(&sbi->s_cred_lock); if (uopt->session == 0xFFFFFFFF) sbi->s_session = udf_get_last_session(sb); else sbi->s_session = uopt->session; udf_debug("Multi-session=%d\n", sbi->s_session); /* Fill in the rest of the superblock */ sb->s_op = &udf_sb_ops; sb->s_export_op = &udf_export_ops; sb->s_magic = UDF_SUPER_MAGIC; sb->s_time_gran = 1000; if (uopt->flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { ret = udf_load_vrs(sb, uopt, silent, &fileset); } else { uopt->blocksize = bdev_logical_block_size(sb->s_bdev); while (uopt->blocksize <= 4096) { ret = udf_load_vrs(sb, uopt, silent, &fileset); if (ret < 0) { if (!silent && ret != -EACCES) { pr_notice("Scanning with blocksize %u failed\n", uopt->blocksize); } brelse(sbi->s_lvid_bh); sbi->s_lvid_bh = NULL; /* * EACCES is special - we want to propagate to * upper layers that we cannot handle RW mount. */ if (ret == -EACCES) break; } else break; uopt->blocksize <<= 1; } } if (ret < 0) { if (ret == -EAGAIN) { udf_warn(sb, "No partition found (1)\n"); ret = -EINVAL; } goto error_out; } udf_debug("Lastblock=%u\n", sbi->s_last_block); if (sbi->s_lvid_bh) { struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); uint16_t minUDFReadRev; uint16_t minUDFWriteRev; if (!lvidiu) { ret = -EINVAL; goto error_out; } minUDFReadRev = le16_to_cpu(lvidiu->minUDFReadRev); minUDFWriteRev = le16_to_cpu(lvidiu->minUDFWriteRev); if (minUDFReadRev > UDF_MAX_READ_VERSION) { udf_err(sb, "minUDFReadRev=%x (max is %x)\n", minUDFReadRev, UDF_MAX_READ_VERSION); ret = -EINVAL; goto error_out; } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) { if (!sb_rdonly(sb)) { ret = -EACCES; goto error_out; } UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); } sbi->s_udfrev = minUDFWriteRev; if (minUDFReadRev >= UDF_VERS_USE_EXTENDED_FE) UDF_SET_FLAG(sb, UDF_FLAG_USE_EXTENDED_FE); if (minUDFReadRev >= UDF_VERS_USE_STREAMS) UDF_SET_FLAG(sb, UDF_FLAG_USE_STREAMS); } if (!sbi->s_partitions) { udf_warn(sb, "No partition found (2)\n"); ret = -EINVAL; goto error_out; } if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & UDF_PART_FLAG_READ_ONLY) { if (!sb_rdonly(sb)) { ret = -EACCES; goto error_out; } UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); } ret = udf_find_fileset(sb, &fileset, &rootdir); if (ret < 0) { udf_warn(sb, "No fileset found\n"); goto error_out; } if (!silent) { struct timestamp ts; udf_time_to_disk_stamp(&ts, sbi->s_record_time); udf_info("Mounting volume '%s', timestamp %04u/%02u/%02u %02u:%02u (%x)\n", sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day, ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone)); } if (!sb_rdonly(sb)) { udf_open_lvid(sb); lvid_open = true; } /* Assign the root inode */ /* assign inodes by physical block number */ /* perhaps it's not extensible enough, but for now ... */ inode = udf_iget(sb, &rootdir); if (IS_ERR(inode)) { udf_err(sb, "Error in udf_iget, block=%u, partition=%u\n", rootdir.logicalBlockNum, rootdir.partitionReferenceNum); ret = PTR_ERR(inode); goto error_out; } /* Allocate a dentry for the root inode */ sb->s_root = d_make_root(inode); if (!sb->s_root) { udf_err(sb, "Couldn't allocate root dentry\n"); ret = -ENOMEM; goto error_out; } sb->s_maxbytes = UDF_MAX_FILESIZE; sb->s_max_links = UDF_MAX_LINKS; return 0; error_out: iput(sbi->s_vat_inode); unload_nls(uopt->nls_map); if (lvid_open) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); udf_sb_free_partitions(sb); kfree(sbi); sb->s_fs_info = NULL; return ret; } void _udf_err(struct super_block *sb, const char *function, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err("error (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); } void _udf_warn(struct super_block *sb, const char *function, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_warn("warning (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); } static void udf_put_super(struct super_block *sb) { struct udf_sb_info *sbi; sbi = UDF_SB(sb); iput(sbi->s_vat_inode); unload_nls(sbi->s_nls_map); if (!sb_rdonly(sb)) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); udf_sb_free_partitions(sb); mutex_destroy(&sbi->s_alloc_mutex); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } static int udf_sync_fs(struct super_block *sb, int wait) { struct udf_sb_info *sbi = UDF_SB(sb); mutex_lock(&sbi->s_alloc_mutex); if (sbi->s_lvid_dirty) { struct buffer_head *bh = sbi->s_lvid_bh; struct logicalVolIntegrityDesc *lvid; lvid = (struct logicalVolIntegrityDesc *)bh->b_data; udf_finalize_lvid(lvid); /* * Blockdevice will be synced later so we don't have to submit * the buffer for IO */ mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; } mutex_unlock(&sbi->s_alloc_mutex); return 0; } static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct logicalVolIntegrityDescImpUse *lvidiu; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); lvidiu = udf_sb_lvidiu(sb); buf->f_type = UDF_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = sbi->s_partmaps[sbi->s_partition].s_partition_len; buf->f_bfree = udf_count_free(sb); buf->f_bavail = buf->f_bfree; /* * Let's pretend each free block is also a free 'inode' since UDF does * not have separate preallocated table of inodes. */ buf->f_files = (lvidiu != NULL ? (le32_to_cpu(lvidiu->numFiles) + le32_to_cpu(lvidiu->numDirs)) : 0) + buf->f_bfree; buf->f_ffree = buf->f_bfree; buf->f_namelen = UDF_NAME_LEN; buf->f_fsid = u64_to_fsid(id); return 0; } static unsigned int udf_count_free_bitmap(struct super_block *sb, struct udf_bitmap *bitmap) { struct buffer_head *bh = NULL; unsigned int accum = 0; int index; udf_pblk_t block = 0, newblock; struct kernel_lb_addr loc; uint32_t bytes; uint8_t *ptr; uint16_t ident; struct spaceBitmapDesc *bm; loc.logicalBlockNum = bitmap->s_extPosition; loc.partitionReferenceNum = UDF_SB(sb)->s_partition; bh = udf_read_ptagged(sb, &loc, 0, &ident); if (!bh) { udf_err(sb, "udf_count_free failed\n"); goto out; } else if (ident != TAG_IDENT_SBD) { brelse(bh); udf_err(sb, "udf_count_free failed\n"); goto out; } bm = (struct spaceBitmapDesc *)bh->b_data; bytes = le32_to_cpu(bm->numOfBytes); index = sizeof(struct spaceBitmapDesc); /* offset in first block only */ ptr = (uint8_t *)bh->b_data; while (bytes > 0) { u32 cur_bytes = min_t(u32, bytes, sb->s_blocksize - index); accum += bitmap_weight((const unsigned long *)(ptr + index), cur_bytes * 8); bytes -= cur_bytes; if (bytes) { brelse(bh); newblock = udf_get_lb_pblock(sb, &loc, ++block); bh = sb_bread(sb, newblock); if (!bh) { udf_debug("read failed\n"); goto out; } index = 0; ptr = (uint8_t *)bh->b_data; } } brelse(bh); out: return accum; } static unsigned int udf_count_free_table(struct super_block *sb, struct inode *table) { unsigned int accum = 0; uint32_t elen; struct kernel_lb_addr eloc; struct extent_position epos; int8_t etype; mutex_lock(&UDF_SB(sb)->s_alloc_mutex); epos.block = UDF_I(table)->i_location; epos.offset = sizeof(struct unallocSpaceEntry); epos.bh = NULL; while (udf_next_aext(table, &epos, &eloc, &elen, &etype, 1) > 0) accum += (elen >> table->i_sb->s_blocksize_bits); brelse(epos.bh); mutex_unlock(&UDF_SB(sb)->s_alloc_mutex); return accum; } static unsigned int udf_count_free(struct super_block *sb) { unsigned int accum = 0; struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; unsigned int part = sbi->s_partition; int ptype = sbi->s_partmaps[part].s_partition_type; if (ptype == UDF_METADATA_MAP25) { part = sbi->s_partmaps[part].s_type_specific.s_metadata. s_phys_partition_ref; } else if (ptype == UDF_VIRTUAL_MAP15 || ptype == UDF_VIRTUAL_MAP20) { /* * Filesystems with VAT are append-only and we cannot write to * them. Let's just report 0 here. */ return 0; } if (sbi->s_lvid_bh) { struct logicalVolIntegrityDesc *lvid = (struct logicalVolIntegrityDesc *) sbi->s_lvid_bh->b_data; if (le32_to_cpu(lvid->numOfPartitions) > part) { accum = le32_to_cpu( lvid->freeSpaceTable[part]); if (accum == 0xFFFFFFFF) accum = 0; } } if (accum) return accum; map = &sbi->s_partmaps[part]; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { accum += udf_count_free_bitmap(sb, map->s_uspace.s_bitmap); } if (accum) return accum; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { accum += udf_count_free_table(sb, map->s_uspace.s_table); } return accum; } MODULE_AUTHOR("Ben Fennema"); MODULE_DESCRIPTION("Universal Disk Format Filesystem"); MODULE_LICENSE("GPL"); module_init(init_udf_fs) module_exit(exit_udf_fs)
10 10 10 19 1 19 19 19 19 1 1 19 18 18 18 10 23 23 23 23 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 /* * Route Plug-In * Copyright (c) 2000 by Abramo Bagnara <abramo@alsa-project.org> * * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/time.h> #include <sound/core.h> #include <sound/pcm.h> #include "pcm_plugin.h" static void zero_areas(struct snd_pcm_plugin_channel *dvp, int ndsts, snd_pcm_uframes_t frames, snd_pcm_format_t format) { int dst = 0; for (; dst < ndsts; ++dst) { if (dvp->wanted) snd_pcm_area_silence(&dvp->area, 0, frames, format); dvp->enabled = 0; dvp++; } } static inline void copy_area(const struct snd_pcm_plugin_channel *src_channel, struct snd_pcm_plugin_channel *dst_channel, snd_pcm_uframes_t frames, snd_pcm_format_t format) { dst_channel->enabled = 1; snd_pcm_area_copy(&src_channel->area, 0, &dst_channel->area, 0, frames, format); } static snd_pcm_sframes_t route_transfer(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { int nsrcs, ndsts, dst; struct snd_pcm_plugin_channel *dvp; snd_pcm_format_t format; if (snd_BUG_ON(!plugin || !src_channels || !dst_channels)) return -ENXIO; if (frames == 0) return 0; if (frames > dst_channels[0].frames) frames = dst_channels[0].frames; nsrcs = plugin->src_format.channels; ndsts = plugin->dst_format.channels; format = plugin->dst_format.format; dvp = dst_channels; if (nsrcs <= 1) { /* expand to all channels */ for (dst = 0; dst < ndsts; ++dst) { copy_area(src_channels, dvp, frames, format); dvp++; } return frames; } for (dst = 0; dst < ndsts && dst < nsrcs; ++dst) { copy_area(src_channels, dvp, frames, format); dvp++; src_channels++; } if (dst < ndsts) zero_areas(dvp, ndsts - dst, frames, format); return frames; } int snd_pcm_plugin_build_route(struct snd_pcm_substream *plug, struct snd_pcm_plugin_format *src_format, struct snd_pcm_plugin_format *dst_format, struct snd_pcm_plugin **r_plugin) { struct snd_pcm_plugin *plugin; int err; if (snd_BUG_ON(!r_plugin)) return -ENXIO; *r_plugin = NULL; if (snd_BUG_ON(src_format->rate != dst_format->rate)) return -ENXIO; if (snd_BUG_ON(src_format->format != dst_format->format)) return -ENXIO; err = snd_pcm_plugin_build(plug, "route conversion", src_format, dst_format, 0, &plugin); if (err < 0) return err; plugin->transfer = route_transfer; *r_plugin = plugin; return 0; }
7 7 7 7 6 5 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 // SPDX-License-Identifier: GPL-2.0-or-later /* * SR-IPv6 implementation * * Author: * David Lebrun <david.lebrun@uclouvain.be> */ #include <linux/types.h> #include <linux/skbuff.h> #include <linux/net.h> #include <linux/module.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/lwtunnel.h> #include <net/netevent.h> #include <net/netns/generic.h> #include <net/ip6_fib.h> #include <net/route.h> #include <net/seg6.h> #include <linux/seg6.h> #include <linux/seg6_iptunnel.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/dst_cache.h> #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif #include <linux/netfilter.h> static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) { int head = 0; switch (tuninfo->mode) { case SEG6_IPTUN_MODE_INLINE: break; case SEG6_IPTUN_MODE_ENCAP: case SEG6_IPTUN_MODE_ENCAP_RED: head = sizeof(struct ipv6hdr); break; case SEG6_IPTUN_MODE_L2ENCAP: case SEG6_IPTUN_MODE_L2ENCAP_RED: return 0; } return ((tuninfo->srh->hdrlen + 1) << 3) + head; } struct seg6_lwt { struct dst_cache cache; struct seg6_iptunnel_encap tuninfo[]; }; static inline struct seg6_lwt *seg6_lwt_lwtunnel(struct lwtunnel_state *lwt) { return (struct seg6_lwt *)lwt->data; } static inline struct seg6_iptunnel_encap * seg6_encap_lwtunnel(struct lwtunnel_state *lwt) { return seg6_lwt_lwtunnel(lwt)->tuninfo; } static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = { [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY }, }; static int nla_put_srh(struct sk_buff *skb, int attrtype, struct seg6_iptunnel_encap *tuninfo) { struct seg6_iptunnel_encap *data; struct nlattr *nla; int len; len = SEG6_IPTUN_ENCAP_SIZE(tuninfo); nla = nla_reserve(skb, attrtype, len); if (!nla) return -EMSGSIZE; data = nla_data(nla); memcpy(data, tuninfo, len); return 0; } static void set_tun_src(struct net *net, struct net_device *dev, struct in6_addr *daddr, struct in6_addr *saddr) { struct seg6_pernet_data *sdata = seg6_pernet(net); struct in6_addr *tun_src; rcu_read_lock(); tun_src = rcu_dereference(sdata->tun_src); if (!ipv6_addr_any(tun_src)) { memcpy(saddr, tun_src, sizeof(struct in6_addr)); } else { ipv6_dev_get_saddr(net, dev, daddr, IPV6_PREFER_SRC_PUBLIC, saddr); } rcu_read_unlock(); } /* Compute flowlabel for outer IPv6 header */ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb, struct ipv6hdr *inner_hdr) { int do_flowlabel = net->ipv6.sysctl.seg6_flowlabel; __be32 flowlabel = 0; u32 hash; if (do_flowlabel > 0) { hash = skb_get_hash(skb); hash = rol32(hash, 16); flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; } else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) { flowlabel = ip6_flowlabel(inner_hdr); } return flowlabel; } static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst_dev(dst); struct net *net = dev_net(dev); struct ipv6hdr *hdr, *inner_hdr; struct ipv6_sr_hdr *isrh; int hdrlen, tot_len, err; __be32 flowlabel; hdrlen = (osrh->hdrlen + 1) << 3; tot_len = hdrlen + sizeof(*hdr); err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; inner_hdr = ipv6_hdr(skb); flowlabel = seg6_make_flowlabel(net, skb, inner_hdr); skb_push(skb, tot_len); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); hdr = ipv6_hdr(skb); /* inherit tc, flowlabel and hlim * hlim will be decremented in ip6_forward() afterwards and * decapsulation will overwrite inner hlim with outer hlim */ if (skb->protocol == htons(ETH_P_IPV6)) { ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)), flowlabel); hdr->hop_limit = inner_hdr->hop_limit; } else { ip6_flow_hdr(hdr, 0, flowlabel); hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb)); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); /* the control block has been erased, so we have to set the * iif once again. * We read the receiving interface index directly from the * skb->skb_iif as it is done in the IPv4 receiving path (i.e.: * ip_rcv_core(...)). */ IP6CB(skb)->iif = skb->skb_iif; } hdr->nexthdr = NEXTHDR_ROUTING; isrh = (void *)hdr + sizeof(*hdr); memcpy(isrh, osrh, hdrlen); isrh->nexthdr = proto; hdr->daddr = isrh->segments[isrh->first_segment]; set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(isrh)) { err = seg6_push_hmac(net, &hdr->saddr, isrh); if (unlikely(err)) return err; } #endif hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, hdr, tot_len); return 0; } /* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) { return __seg6_do_srh_encap(skb, osrh, proto, NULL); } EXPORT_SYMBOL_GPL(seg6_do_srh_encap); /* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */ static int seg6_do_srh_encap_red(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto, struct dst_entry *cache_dst) { __u8 first_seg = osrh->first_segment; struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst_dev(dst); struct net *net = dev_net(dev); struct ipv6hdr *hdr, *inner_hdr; int hdrlen = ipv6_optlen(osrh); int red_tlv_offset, tlv_offset; struct ipv6_sr_hdr *isrh; bool skip_srh = false; __be32 flowlabel; int tot_len, err; int red_hdrlen; int tlvs_len; if (first_seg > 0) { red_hdrlen = hdrlen - sizeof(struct in6_addr); } else { /* NOTE: if tag/flags and/or other TLVs are introduced in the * seg6_iptunnel infrastructure, they should be considered when * deciding to skip the SRH. */ skip_srh = !sr_has_hmac(osrh); red_hdrlen = skip_srh ? 0 : hdrlen; } tot_len = red_hdrlen + sizeof(struct ipv6hdr); err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; inner_hdr = ipv6_hdr(skb); flowlabel = seg6_make_flowlabel(net, skb, inner_hdr); skb_push(skb, tot_len); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); hdr = ipv6_hdr(skb); /* based on seg6_do_srh_encap() */ if (skb->protocol == htons(ETH_P_IPV6)) { ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)), flowlabel); hdr->hop_limit = inner_hdr->hop_limit; } else { ip6_flow_hdr(hdr, 0, flowlabel); hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb)); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); IP6CB(skb)->iif = skb->skb_iif; } /* no matter if we have to skip the SRH or not, the first segment * always comes in the pushed IPv6 header. */ hdr->daddr = osrh->segments[first_seg]; if (skip_srh) { hdr->nexthdr = proto; set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); goto out; } /* we cannot skip the SRH, slow path */ hdr->nexthdr = NEXTHDR_ROUTING; isrh = (void *)hdr + sizeof(struct ipv6hdr); if (unlikely(!first_seg)) { /* this is a very rare case; we have only one SID but * we cannot skip the SRH since we are carrying some * other info. */ memcpy(isrh, osrh, hdrlen); goto srcaddr; } tlv_offset = sizeof(*osrh) + (first_seg + 1) * sizeof(struct in6_addr); red_tlv_offset = tlv_offset - sizeof(struct in6_addr); memcpy(isrh, osrh, red_tlv_offset); tlvs_len = hdrlen - tlv_offset; if (unlikely(tlvs_len > 0)) { const void *s = (const void *)osrh + tlv_offset; void *d = (void *)isrh + red_tlv_offset; memcpy(d, s, tlvs_len); } --isrh->first_segment; isrh->hdrlen -= 2; srcaddr: isrh->nexthdr = proto; set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); #ifdef CONFIG_IPV6_SEG6_HMAC if (unlikely(!skip_srh && sr_has_hmac(isrh))) { err = seg6_push_hmac(net, &hdr->saddr, isrh); if (unlikely(err)) return err; } #endif out: hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, hdr, tot_len); return 0; } static int __seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, struct dst_entry *cache_dst) { struct ipv6hdr *hdr, *oldhdr; struct ipv6_sr_hdr *isrh; int hdrlen, err; hdrlen = (osrh->hdrlen + 1) << 3; err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; oldhdr = ipv6_hdr(skb); skb_pull(skb, sizeof(struct ipv6hdr)); skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(struct ipv6hdr)); skb_push(skb, sizeof(struct ipv6hdr) + hdrlen); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); hdr = ipv6_hdr(skb); memmove(hdr, oldhdr, sizeof(*hdr)); isrh = (void *)hdr + sizeof(*hdr); memcpy(isrh, osrh, hdrlen); isrh->nexthdr = hdr->nexthdr; hdr->nexthdr = NEXTHDR_ROUTING; isrh->segments[0] = hdr->daddr; hdr->daddr = isrh->segments[isrh->first_segment]; #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(isrh)) { struct net *net = skb_dst_dev_net(skb); err = seg6_push_hmac(net, &hdr->saddr, isrh); if (unlikely(err)) return err; } #endif hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen); return 0; } static int seg6_do_srh(struct sk_buff *skb, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct seg6_iptunnel_encap *tinfo; int proto, err = 0; tinfo = seg6_encap_lwtunnel(dst->lwtstate); switch (tinfo->mode) { case SEG6_IPTUN_MODE_INLINE: if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; err = __seg6_do_srh_inline(skb, tinfo->srh, cache_dst); if (err) return err; break; case SEG6_IPTUN_MODE_ENCAP: case SEG6_IPTUN_MODE_ENCAP_RED: err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6); if (err) return err; if (skb->protocol == htons(ETH_P_IPV6)) proto = IPPROTO_IPV6; else if (skb->protocol == htons(ETH_P_IP)) proto = IPPROTO_IPIP; else return -EINVAL; if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP) err = __seg6_do_srh_encap(skb, tinfo->srh, proto, cache_dst); else err = seg6_do_srh_encap_red(skb, tinfo->srh, proto, cache_dst); if (err) return err; skb_set_inner_transport_header(skb, skb_transport_offset(skb)); skb_set_inner_protocol(skb, skb->protocol); skb->protocol = htons(ETH_P_IPV6); break; case SEG6_IPTUN_MODE_L2ENCAP: case SEG6_IPTUN_MODE_L2ENCAP_RED: if (!skb_mac_header_was_set(skb)) return -EINVAL; if (pskb_expand_head(skb, skb->mac_len, 0, GFP_ATOMIC) < 0) return -ENOMEM; skb_mac_header_rebuild(skb); skb_push(skb, skb->mac_len); if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP) err = __seg6_do_srh_encap(skb, tinfo->srh, IPPROTO_ETHERNET, cache_dst); else err = seg6_do_srh_encap_red(skb, tinfo->srh, IPPROTO_ETHERNET, cache_dst); if (err) return err; skb->protocol = htons(ETH_P_IPV6); break; } skb_set_transport_header(skb, sizeof(struct ipv6hdr)); nf_reset_ct(skb); return 0; } /* insert an SRH within an IPv6 packet, just after the IPv6 header */ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) { return __seg6_do_srh_inline(skb, osrh, NULL); } EXPORT_SYMBOL_GPL(seg6_do_srh_inline); static int seg6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return dst_input(skb); } static int seg6_input_core(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct lwtunnel_state *lwtst; struct seg6_lwt *slwt; int err; /* We cannot dereference "orig_dst" once ip6_route_input() or * skb_dst_drop() is called. However, in order to detect a dst loop, we * need the address of its lwtstate. So, save the address of lwtstate * now and use it later as a comparison. */ lwtst = orig_dst->lwtstate; slwt = seg6_lwt_lwtunnel(lwtst); local_bh_disable(); dst = dst_cache_get(&slwt->cache); local_bh_enable(); err = seg6_do_srh(skb, dst); if (unlikely(err)) { dst_release(dst); goto drop; } if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); /* cache only if we don't create a dst reference loop */ if (!dst->error && lwtst != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &ipv6_hdr(skb)->saddr); local_bh_enable(); } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, dev_net(skb->dev), NULL, skb, NULL, skb_dst_dev(skb), seg6_input_finish); return seg6_input_finish(dev_net(skb->dev), NULL, skb); drop: kfree_skb(skb); return err; } static int seg6_input_nf(struct sk_buff *skb) { struct net_device *dev = skb_dst_dev(skb); struct net *net = dev_net(skb->dev); switch (skb->protocol) { case htons(ETH_P_IP): return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, NULL, skb, NULL, dev, seg6_input_core); case htons(ETH_P_IPV6): return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, NULL, skb, NULL, dev, seg6_input_core); } return -EINVAL; } static int seg6_input(struct sk_buff *skb) { if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return seg6_input_nf(skb); return seg6_input_core(dev_net(skb->dev), NULL, skb); } static int seg6_output_core(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct seg6_lwt *slwt; int err; slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); local_bh_disable(); dst = dst_cache_get(&slwt->cache); local_bh_enable(); err = seg6_do_srh(skb, dst); if (unlikely(err)) goto drop; if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); fl6.daddr = hdr->daddr; fl6.saddr = hdr->saddr; fl6.flowlabel = ip6_flowinfo(hdr); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = hdr->nexthdr; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; goto drop; } /* cache only if we don't create a dst reference loop */ if (orig_dst->lwtstate != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); local_bh_enable(); } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } skb_dst_drop(skb); skb_dst_set(skb, dst); if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, dst_dev(dst), dst_output); return dst_output(net, sk, skb); drop: dst_release(dst); kfree_skb(skb); return err; } static int seg6_output_nf(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst_dev(skb); switch (skb->protocol) { case htons(ETH_P_IP): return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, seg6_output_core); case htons(ETH_P_IPV6): return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, seg6_output_core); } return -EINVAL; } static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return seg6_output_nf(net, sk, skb); return seg6_output_core(net, sk, skb); } static int seg6_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[SEG6_IPTUNNEL_MAX + 1]; struct seg6_iptunnel_encap *tuninfo; struct lwtunnel_state *newts; int tuninfo_len, min_size; struct seg6_lwt *slwt; int err; if (family != AF_INET && family != AF_INET6) return -EINVAL; err = nla_parse_nested_deprecated(tb, SEG6_IPTUNNEL_MAX, nla, seg6_iptunnel_policy, extack); if (err < 0) return err; if (!tb[SEG6_IPTUNNEL_SRH]) return -EINVAL; tuninfo = nla_data(tb[SEG6_IPTUNNEL_SRH]); tuninfo_len = nla_len(tb[SEG6_IPTUNNEL_SRH]); /* tuninfo must contain at least the iptunnel encap structure, * the SRH and one segment */ min_size = sizeof(*tuninfo) + sizeof(struct ipv6_sr_hdr) + sizeof(struct in6_addr); if (tuninfo_len < min_size) return -EINVAL; switch (tuninfo->mode) { case SEG6_IPTUN_MODE_INLINE: if (family != AF_INET6) return -EINVAL; break; case SEG6_IPTUN_MODE_ENCAP: break; case SEG6_IPTUN_MODE_L2ENCAP: break; case SEG6_IPTUN_MODE_ENCAP_RED: break; case SEG6_IPTUN_MODE_L2ENCAP_RED: break; default: return -EINVAL; } /* verify that SRH is consistent */ if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo), false)) return -EINVAL; newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt)); if (!newts) return -ENOMEM; slwt = seg6_lwt_lwtunnel(newts); err = dst_cache_init(&slwt->cache, GFP_ATOMIC); if (err) { kfree(newts); return err; } memcpy(&slwt->tuninfo, tuninfo, tuninfo_len); newts->type = LWTUNNEL_ENCAP_SEG6; newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP) newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; newts->headroom = seg6_lwt_headroom(tuninfo); *ts = newts; return 0; } static void seg6_destroy_state(struct lwtunnel_state *lwt) { dst_cache_destroy(&seg6_lwt_lwtunnel(lwt)->cache); } static int seg6_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate); if (nla_put_srh(skb, SEG6_IPTUNNEL_SRH, tuninfo)) return -EMSGSIZE; return 0; } static int seg6_encap_nlsize(struct lwtunnel_state *lwtstate) { struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate); return nla_total_size(SEG6_IPTUN_ENCAP_SIZE(tuninfo)); } static int seg6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct seg6_iptunnel_encap *a_hdr = seg6_encap_lwtunnel(a); struct seg6_iptunnel_encap *b_hdr = seg6_encap_lwtunnel(b); int len = SEG6_IPTUN_ENCAP_SIZE(a_hdr); if (len != SEG6_IPTUN_ENCAP_SIZE(b_hdr)) return 1; return memcmp(a_hdr, b_hdr, len); } static const struct lwtunnel_encap_ops seg6_iptun_ops = { .build_state = seg6_build_state, .destroy_state = seg6_destroy_state, .output = seg6_output, .input = seg6_input, .fill_encap = seg6_fill_encap_info, .get_encap_size = seg6_encap_nlsize, .cmp_encap = seg6_encap_cmp, .owner = THIS_MODULE, }; int __init seg6_iptunnel_init(void) { return lwtunnel_encap_add_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6); } void seg6_iptunnel_exit(void) { lwtunnel_encap_del_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6); }
3 3 3 1 1 1 1 3 3 3 2 2 2 36 36 36 14 9 4 13 15 21 12 11 12 3 3 3 3 3 14 5 5 14 13 29 30 30 30 13 20 13 18 24 24 23 23 23 4 23 23 23 23 18 5 23 23 18 18 18 16 18 17 18 21 1 25 24 22 18 1 7 4 3 1 7 7 7 24 20 5 6 22 12 2 180 36 177 189 188 189 165 33 33 33 189 75 75 157 158 2 2 98 98 194 194 194 168 36 194 25 34 1 194 174 172 172 172 33 172 167 167 167 33 167 39 39 39 39 13 13 13 39 35 35 35 35 34 1 1 1 35 15 15 15 15 15 15 15 15 15 15 15 14 13 13 13 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/blktrace_api.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/debugfs.h> #include <linux/export.h> #include <linux/time.h> #include <linux/uaccess.h> #include <linux/list.h> #include <linux/blk-cgroup.h> #include "../../block/blk.h" #include <trace/events/block.h> #include "trace_output.h" #ifdef CONFIG_BLK_DEV_IO_TRACE static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; static bool blk_tracer_enabled __read_mostly; static LIST_HEAD(running_trace_list); static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 #define TRACE_BLK_OPT_CGROUP 0x2 #define TRACE_BLK_OPT_CGNAME 0x4 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, #ifdef CONFIG_BLK_CGROUP { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) }, #endif { } }; static struct tracer_flags blk_tracer_flags = { .val = 0, .opts = blk_tracer_opts, }; /* Global reference count of probes */ static DEFINE_MUTEX(blk_probe_mutex); static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); /* * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, const void *data, size_t len, u64 cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; unsigned int trace_ctx = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; if (blk_tracer) { buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len + cgid_len, trace_ctx); if (!event) return; t = ring_buffer_event_data(event); goto record_it; } if (!bt->rchan) return; t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); if (t) { t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); record_it: t->device = bt->dev; t->action = action | (cgid ? __BLK_TN_CGROUP : 0); t->pid = pid; t->cpu = cpu; t->pdu_len = len + cgid_len; if (cgid_len) memcpy((void *)t + sizeof(*t), &cgid, cgid_len); memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); } } /* * Send out a notify for this process, if we haven't done so since a trace * started */ static void trace_note_tsk(struct task_struct *tsk) { unsigned long flags; struct blk_trace *bt; tsk->btrace_seq = blktrace_seq; raw_spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm), 0); } raw_spin_unlock_irqrestore(&running_trace_lock, flags); } static void trace_note_time(struct blk_trace *bt) { struct timespec64 now; unsigned long flags; u32 words[2]; /* need to check user space to see if this breaks in y2038 or y2106 */ ktime_get_real_ts64(&now); words[0] = (u32)now.tv_sec; words[1] = now.tv_nsec; local_irq_save(flags); trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0); local_irq_restore(flags); } void __blk_trace_note_message(struct blk_trace *bt, struct cgroup_subsys_state *css, const char *fmt, ...) { int n; va_list args; unsigned long flags; char *buf; u64 cgid = 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer_enabled)) return; /* * If the BLK_TC_NOTIFY action mask isn't set, don't send any note * message to the trace. */ if (!(bt->act_mask & BLK_TC_NOTIFY)) return; local_irq_save(flags); buf = this_cpu_ptr(bt->msg_data); va_start(args, fmt); n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); #ifdef CONFIG_BLK_CGROUP if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) cgid = cgroup_id(css->cgroup); else cgid = 1; #endif trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__blk_trace_note_message); static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, pid_t pid) { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) return 1; if (sector && (sector < bt->start_lba || sector > bt->end_lba)) return 1; if (bt->pid && pid != bt->pid) return 1; return 0; } /* * Data direction bit lookup */ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; #define BLK_TC_RAHEAD BLK_TC_AHEAD #define BLK_TC_PREFLUSH BLK_TC_FLUSH /* The ilog2() calls fall out because they're constant */ #define MASK_TC_BIT(rw, __name) ((__force u32)(rw & REQ_ ## __name) << \ (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) /* * The worker for the various blk_add_trace*() types. Fills out a * blk_io_trace structure and places it in a per-cpu subbuffer. */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, const blk_opf_t opf, u32 what, int error, int pdu_len, void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; unsigned int trace_ctx = 0; pid_t pid; int cpu; bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; const enum req_op op = opf & REQ_OP_MASK; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; what |= ddir_act[op_is_write(op) ? WRITE : READ]; what |= MASK_TC_BIT(opf, SYNC); what |= MASK_TC_BIT(opf, RAHEAD); what |= MASK_TC_BIT(opf, META); what |= MASK_TC_BIT(opf, PREFLUSH); what |= MASK_TC_BIT(opf, FUA); if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) what |= BLK_TC_ACT(BLK_TC_FLUSH); if (cgid) what |= __BLK_TA_CGROUP; pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) return; cpu = raw_smp_processor_id(); if (blk_tracer) { tracing_record_cmdline(current); buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + pdu_len + cgid_len, trace_ctx); if (!event) return; t = ring_buffer_event_data(event); goto record_it; } if (unlikely(tsk->btrace_seq != blktrace_seq)) trace_note_tsk(tsk); /* * A word about the locking here - we disable interrupts to reserve * some space in the relay per-cpu buffer, to prevent an irq * from coming in and stepping on our toes. */ local_irq_save(flags); t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->sequence = ++(*sequence); t->time = ktime_to_ns(ktime_get()); record_it: /* * These two are not needed in ftrace as they are in the * generic trace_entry, filled by tracing_generic_entry_update, * but for the trace_event->bin() synthesizer benefit we do it * here too. */ t->cpu = cpu; t->pid = pid; t->sector = sector; t->bytes = bytes; t->action = what; t->device = bt->dev; t->error = error; t->pdu_len = pdu_len + cgid_len; if (cgid_len) memcpy((void *)t + sizeof(*t), &cgid, cgid_len); if (pdu_len) memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); if (blk_tracer) { trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); return; } } local_irq_restore(flags); } static void blk_trace_free(struct request_queue *q, struct blk_trace *bt) { relay_close(bt->rchan); /* * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created * under 'q->debugfs_dir', thus lookup and remove them. */ if (!bt->dir) { debugfs_lookup_and_remove("dropped", q->debugfs_dir); debugfs_lookup_and_remove("msg", q->debugfs_dir); } else { debugfs_remove(bt->dir); } free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); } static void get_probe_ref(void) { mutex_lock(&blk_probe_mutex); if (++blk_probes_ref == 1) blk_register_tracepoints(); mutex_unlock(&blk_probe_mutex); } static void put_probe_ref(void) { mutex_lock(&blk_probe_mutex); if (!--blk_probes_ref) blk_unregister_tracepoints(); mutex_unlock(&blk_probe_mutex); } static int blk_trace_start(struct blk_trace *bt) { if (bt->trace_state != Blktrace_setup && bt->trace_state != Blktrace_stopped) return -EINVAL; blktrace_seq++; smp_mb(); bt->trace_state = Blktrace_running; raw_spin_lock_irq(&running_trace_lock); list_add(&bt->running_list, &running_trace_list); raw_spin_unlock_irq(&running_trace_lock); trace_note_time(bt); return 0; } static int blk_trace_stop(struct blk_trace *bt) { if (bt->trace_state != Blktrace_running) return -EINVAL; bt->trace_state = Blktrace_stopped; raw_spin_lock_irq(&running_trace_lock); list_del_init(&bt->running_list); raw_spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); return 0; } static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { blk_trace_stop(bt); synchronize_rcu(); blk_trace_free(q, bt); put_probe_ref(); } static int __blk_trace_remove(struct request_queue *q) { struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, lockdep_is_held(&q->debugfs_mutex)); if (!bt) return -EINVAL; blk_trace_cleanup(q, bt); return 0; } int blk_trace_remove(struct request_queue *q) { int ret; mutex_lock(&q->debugfs_mutex); ret = __blk_trace_remove(q); mutex_unlock(&q->debugfs_mutex); return ret; } EXPORT_SYMBOL_GPL(blk_trace_remove); static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { struct blk_trace *bt = filp->private_data; size_t dropped = relay_stats(bt->rchan, RELAY_STATS_BUF_FULL); char buf[16]; snprintf(buf, sizeof(buf), "%zu\n", dropped); return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); } static const struct file_operations blk_dropped_fops = { .owner = THIS_MODULE, .open = simple_open, .read = blk_dropped_read, .llseek = default_llseek, }; static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, size_t count, loff_t *ppos) { char *msg; struct blk_trace *bt; if (count >= BLK_TN_MAX_MSG) return -EINVAL; msg = memdup_user_nul(buffer, count); if (IS_ERR(msg)) return PTR_ERR(msg); bt = filp->private_data; __blk_trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; } static const struct file_operations blk_msg_fops = { .owner = THIS_MODULE, .open = simple_open, .write = blk_msg_write, .llseek = noop_llseek, }; static int blk_remove_buf_file_callback(struct dentry *dentry) { debugfs_remove(dentry); return 0; } static struct dentry *blk_create_buf_file_callback(const char *filename, struct dentry *parent, umode_t mode, struct rchan_buf *buf, int *is_global) { return debugfs_create_file(filename, mode, parent, buf, &relay_file_operations); } static const struct rchan_callbacks blk_relay_callbacks = { .create_buf_file = blk_create_buf_file_callback, .remove_buf_file = blk_remove_buf_file_callback, }; static void blk_trace_setup_lba(struct blk_trace *bt, struct block_device *bdev) { if (bdev) { bt->start_lba = bdev->bd_start_sect; bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev); } else { bt->start_lba = 0; bt->end_lba = -1ULL; } } /* * Setup everything required to start tracing */ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, struct blk_user_trace_setup *buts) { struct blk_trace *bt = NULL; struct dentry *dir = NULL; int ret; lockdep_assert_held(&q->debugfs_mutex); if (!buts->buf_size || !buts->buf_nr) return -EINVAL; strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); /* * some device names have larger paths - convert the slashes * to underscores for this to work as expected */ strreplace(buts->name, '/', '_'); /* * bdev can be NULL, as with scsi-generic, this is a helpful as * we can be. */ if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) { pr_warn("Concurrent blktraces are not allowed on %s\n", buts->name); return -EBUSY; } bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) return -ENOMEM; ret = -ENOMEM; bt->sequence = alloc_percpu(unsigned long); if (!bt->sequence) goto err; bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); if (!bt->msg_data) goto err; /* * When tracing the whole disk reuse the existing debugfs directory * created by the block layer on init. For partitions block devices, * and scsi-generic block devices we create a temporary new debugfs * directory that will be removed once the trace ends. */ if (bdev && !bdev_is_partition(bdev)) dir = q->debugfs_dir; else bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); /* * As blktrace relies on debugfs for its interface the debugfs directory * is required, contrary to the usual mantra of not checking for debugfs * files or directories. */ if (IS_ERR_OR_NULL(dir)) { pr_warn("debugfs_dir not present for %s so skipping\n", buts->name); ret = -ENOENT; goto err; } bt->dev = dev; INIT_LIST_HEAD(&bt->running_list); ret = -EIO; debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); bt->rchan = relay_open("trace", dir, buts->buf_size, buts->buf_nr, &blk_relay_callbacks, bt); if (!bt->rchan) goto err; bt->act_mask = buts->act_mask; if (!bt->act_mask) bt->act_mask = (u16) -1; blk_trace_setup_lba(bt, bdev); /* overwrite with user settings */ if (buts->start_lba) bt->start_lba = buts->start_lba; if (buts->end_lba) bt->end_lba = buts->end_lba; bt->pid = buts->pid; bt->trace_state = Blktrace_setup; rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); ret = 0; err: if (ret) blk_trace_free(q, bt); return ret; } int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; int ret; ret = copy_from_user(&buts, arg, sizeof(buts)); if (ret) return -EFAULT; mutex_lock(&q->debugfs_mutex); ret = do_blk_trace_setup(q, name, dev, bdev, &buts); mutex_unlock(&q->debugfs_mutex); if (ret) return ret; if (copy_to_user(arg, &buts, sizeof(buts))) { blk_trace_remove(q); return -EFAULT; } return 0; } EXPORT_SYMBOL_GPL(blk_trace_setup); #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) static int compat_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; struct compat_blk_user_trace_setup cbuts; int ret; if (copy_from_user(&cbuts, arg, sizeof(cbuts))) return -EFAULT; buts = (struct blk_user_trace_setup) { .act_mask = cbuts.act_mask, .buf_size = cbuts.buf_size, .buf_nr = cbuts.buf_nr, .start_lba = cbuts.start_lba, .end_lba = cbuts.end_lba, .pid = cbuts.pid, }; mutex_lock(&q->debugfs_mutex); ret = do_blk_trace_setup(q, name, dev, bdev, &buts); mutex_unlock(&q->debugfs_mutex); if (ret) return ret; if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { blk_trace_remove(q); return -EFAULT; } return 0; } #endif static int __blk_trace_startstop(struct request_queue *q, int start) { struct blk_trace *bt; bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; if (start) return blk_trace_start(bt); else return blk_trace_stop(bt); } int blk_trace_startstop(struct request_queue *q, int start) { int ret; mutex_lock(&q->debugfs_mutex); ret = __blk_trace_startstop(q, start); mutex_unlock(&q->debugfs_mutex); return ret; } EXPORT_SYMBOL_GPL(blk_trace_startstop); /* * When reading or writing the blktrace sysfs files, the references to the * opened sysfs or device files should prevent the underlying block device * from being removed. So no further delete protection is really needed. */ /** * blk_trace_ioctl - handle the ioctls associated with tracing * @bdev: the block device * @cmd: the ioctl cmd * @arg: the argument data, if any * **/ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) { struct request_queue *q = bdev_get_queue(bdev); int ret, start = 0; char b[BDEVNAME_SIZE]; switch (cmd) { case BLKTRACESETUP: snprintf(b, sizeof(b), "%pg", bdev); ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: snprintf(b, sizeof(b), "%pg", bdev); ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #endif case BLKTRACESTART: start = 1; fallthrough; case BLKTRACESTOP: ret = blk_trace_startstop(q, start); break; case BLKTRACETEARDOWN: ret = blk_trace_remove(q); break; default: ret = -ENOTTY; break; } return ret; } /** * blk_trace_shutdown - stop and cleanup trace structures * @q: the request queue associated with the device * **/ void blk_trace_shutdown(struct request_queue *q) { if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) __blk_trace_remove(q); } #ifdef CONFIG_BLK_CGROUP static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { struct cgroup_subsys_state *blkcg_css; struct blk_trace *bt; /* We don't use the 'bt' value here except as an optimization... */ bt = rcu_dereference_protected(q->blk_trace, 1); if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0; blkcg_css = bio_blkcg_css(bio); if (!blkcg_css) return 0; return cgroup_id(blkcg_css->cgroup); } #else static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { return 0; } #endif static u64 blk_trace_request_get_cgid(struct request *rq) { if (!rq->bio) return 0; /* Use the first bio */ return blk_trace_bio_get_cgid(rq->q, rq->bio); } /* * blktrace probes */ /** * blk_add_trace_rq - Add a trace for a request oriented action * @rq: the source request * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action * @cgid: the cgroup info * * Description: * Records an action against a request. Will log the bio offset + size. * **/ static void blk_add_trace_rq(struct request *rq, blk_status_t error, unsigned int nr_bytes, u32 what, u64 cgid) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); else what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, rq->cmd_flags, what, blk_status_to_errno(error), 0, NULL, cgid); rcu_read_unlock(); } static void blk_add_trace_rq_insert(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_issue(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_merge(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE, blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_requeue(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, blk_status_t error, unsigned int nr_bytes) { blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, blk_trace_request_get_cgid(rq)); } /** * blk_add_trace_bio - Add a trace for a bio oriented action * @q: queue the io is for * @bio: the source bio * @what: the action * @error: error, if any * * Description: * Records an action against a bio. Will log the bio offset + size. * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, u32 what, int error) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_opf, what, error, 0, NULL, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); } static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, blk_status_to_errno(bio->bi_status)); } static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio) { blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE, 0); } static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio) { blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE, 0); } static void blk_add_trace_bio_queue(void *ignore, struct bio *bio) { blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0); } static void blk_add_trace_getrq(void *ignore, struct bio *bio) { blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0); } static void blk_add_trace_plug(void *ignore, struct request_queue *q) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); rcu_read_unlock(); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, unsigned int depth, bool explicit) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); u32 what; if (explicit) what = BLK_TA_UNPLUG_IO; else what = BLK_TA_UNPLUG_TIMER; __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } rcu_read_unlock(); } static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(pdu); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_SPLIT, blk_status_to_errno(bio->bi_status), sizeof(rpdu), &rpdu, blk_trace_bio_get_cgid(q, bio)); } rcu_read_unlock(); } /** * blk_add_trace_bio_remap - Add a trace for a bio-remap operation * @ignore: trace callback data parameter (not used) * @bio: the source bio * @dev: source device * @from: source sector * * Called after a bio is remapped to a different device and/or sector. **/ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, sector_t from) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; struct blk_io_trace_remap r; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(bio_dev(bio)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_REMAP, blk_status_to_errno(bio->bi_status), sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); } /** * blk_add_trace_rq_remap - Add a trace for a request-remap operation * @ignore: trace callback data parameter (not used) * @rq: the source request * @dev: target device * @from: source sector * * Description: * Device mapper remaps request to other devices. * Add a trace for that action. * **/ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { struct blk_trace *bt; struct blk_io_trace_remap r; rcu_read_lock(); bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(disk_devt(rq->q->disk)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq->cmd_flags, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } /** * blk_add_driver_data - Add binary message with driver-specific data * @rq: io request * @data: driver-specific data * @len: length of driver-specific data * * Description: * Some drivers might want to write driver-specific data per request. * **/ void blk_add_driver_data(struct request *rq, void *data, size_t len) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, BLK_TA_DRV_DATA, 0, len, data, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); static void blk_register_tracepoints(void) { int ret; ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); WARN_ON(ret); ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); WARN_ON(ret); ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); WARN_ON(ret); ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); WARN_ON(ret); ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); WARN_ON(ret); ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); WARN_ON(ret); ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); WARN_ON(ret); ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); WARN_ON(ret); ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); WARN_ON(ret); ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); WARN_ON(ret); ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); WARN_ON(ret); } static void blk_unregister_tracepoints(void) { unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); tracepoint_synchronize_unregister(); } /* * struct blk_io_tracer formatting routines */ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) { int i = 0; int tc = t->action >> BLK_TC_SHIFT; if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { rwbs[i++] = 'N'; goto out; } if (tc & BLK_TC_FLUSH) rwbs[i++] = 'F'; if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; else if (tc & BLK_TC_WRITE) rwbs[i++] = 'W'; else if (t->bytes) rwbs[i++] = 'R'; else rwbs[i++] = 'N'; if (tc & BLK_TC_FUA) rwbs[i++] = 'F'; if (tc & BLK_TC_AHEAD) rwbs[i++] = 'A'; if (tc & BLK_TC_SYNC) rwbs[i++] = 'S'; if (tc & BLK_TC_META) rwbs[i++] = 'M'; out: rwbs[i] = '\0'; } static inline const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) { return (const struct blk_io_trace *)ent; } static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) { return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0); } static inline u64 t_cgid(const struct trace_entry *ent) { return *(u64 *)(te_blk_io_trace(ent) + 1); } static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) { return te_blk_io_trace(ent)->pdu_len - (has_cg ? sizeof(u64) : 0); } static inline u32 t_action(const struct trace_entry *ent) { return te_blk_io_trace(ent)->action; } static inline u32 t_bytes(const struct trace_entry *ent) { return te_blk_io_trace(ent)->bytes; } static inline u32 t_sec(const struct trace_entry *ent) { return te_blk_io_trace(ent)->bytes >> 9; } static inline unsigned long long t_sector(const struct trace_entry *ent) { return te_blk_io_trace(ent)->sector; } static inline __u16 t_error(const struct trace_entry *ent) { return te_blk_io_trace(ent)->error; } static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) { const __be64 *val = pdu_start(ent, has_cg); return be64_to_cpu(*val); } typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act, bool has_cg); static void blk_log_action_classic(struct trace_iterator *iter, const char *act, bool has_cg) { char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); trace_seq_printf(&iter->seq, "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", MAJOR(t->device), MINOR(t->device), iter->cpu, secs, nsec_rem, iter->ent->pid, act, rwbs); } static void blk_log_action(struct trace_iterator *iter, const char *act, bool has_cg) { char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); if (has_cg) { u64 id = t_cgid(iter->ent); if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { char blkcg_name_buf[NAME_MAX + 1] = "<...>"; cgroup_path_from_kernfs_id(id, blkcg_name_buf, sizeof(blkcg_name_buf)); trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", MAJOR(t->device), MINOR(t->device), blkcg_name_buf, act, rwbs); } else { /* * The cgid portion used to be "INO,GEN". Userland * builds a FILEID_INO32_GEN fid out of them and * opens the cgroup using open_by_handle_at(2). * While 32bit ino setups are still the same, 64bit * ones now use the 64bit ino as the whole ID and * no longer use generation. * * Regardless of the content, always output * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can * be mapped back to @id on both 64 and 32bit ino * setups. See __kernfs_fh_to_dentry(). */ trace_seq_printf(&iter->seq, "%3d,%-3d %llx,%-llx %2s %3s ", MAJOR(t->device), MINOR(t->device), id & U32_MAX, id >> 32, act, rwbs); } } else trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); } static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { const unsigned char *pdu_buf; int pdu_len; int i, end; pdu_buf = pdu_start(ent, has_cg); pdu_len = pdu_real_len(ent, has_cg); if (!pdu_len) return; /* find the last zero that needs to be printed */ for (end = pdu_len - 1; end >= 0; end--) if (pdu_buf[end]) break; end++; trace_seq_putc(s, '('); for (i = 0; i < pdu_len; i++) { trace_seq_printf(s, "%s%02x", i == 0 ? "" : " ", pdu_buf[i]); /* * stop when the rest is just zeros and indicate so * with a ".." appended */ if (i == end && end != pdu_len - 1) { trace_seq_puts(s, " ..) "); return; } } trace_seq_puts(s, ") "); } static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { trace_seq_printf(s, "%u ", t_bytes(ent)); blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%s]\n", cmd); } else { if (t_sec(ent)) trace_seq_printf(s, "%llu + %u [%s]\n", t_sector(ent), t_sec(ent), cmd); else trace_seq_printf(s, "[%s]\n", cmd); } } static void blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%d]\n", t_error(ent)); } else { if (t_sec(ent)) trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), t_sec(ent), t_error(ent)); else trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); } } static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", t_sector(ent), t_sec(ent), MAJOR(be32_to_cpu(__r->device_from)), MINOR(be32_to_cpu(__r->device_from)), be64_to_cpu(__r->sector_from)); } static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "[%s]\n", cmd); } static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg)); } static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), get_pdu_int(ent, has_cg), cmd); } static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { trace_seq_putmem(s, pdu_start(ent, has_cg), pdu_real_len(ent, has_cg)); trace_seq_putc(s, '\n'); } /* * struct tracer operations */ static void blk_tracer_print_header(struct seq_file *m) { if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) return; seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" "# | | | | | |\n"); } static void blk_tracer_start(struct trace_array *tr) { blk_tracer_enabled = true; } static int blk_tracer_init(struct trace_array *tr) { blk_tr = tr; blk_tracer_start(tr); return 0; } static void blk_tracer_stop(struct trace_array *tr) { blk_tracer_enabled = false; } static void blk_tracer_reset(struct trace_array *tr) { blk_tracer_stop(tr); } static const struct { const char *act[2]; void (*print)(struct trace_seq *s, const struct trace_entry *ent, bool has_cg); } what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; static enum print_line_t print_one_line(struct trace_iterator *iter, bool classic) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; const struct blk_io_trace *t; u16 what; bool long_act; blk_log_action_t *log_action; bool has_cg; t = te_blk_io_trace(iter->ent); what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; has_cg = t->action & __BLK_TA_CGROUP; if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { log_action(iter, long_act ? "message" : "m", has_cg); blk_log_msg(s, iter->ent, has_cg); return trace_handle_return(s); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) trace_seq_printf(s, "Unknown action %x\n", what); else { log_action(iter, what2act[what].act[long_act], has_cg); what2act[what].print(s, iter->ent, has_cg); } return trace_handle_return(s); } static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, int flags, struct trace_event *event) { return print_one_line(iter, false); } static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; const int offset = offsetof(struct blk_io_trace, sector); struct blk_io_trace old = { .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, .time = iter->ts, }; trace_seq_putmem(s, &old, offset); trace_seq_putmem(s, &t->sector, sizeof(old) - offset + t->pdu_len); } static enum print_line_t blk_trace_event_print_binary(struct trace_iterator *iter, int flags, struct trace_event *event) { blk_trace_synthesize_old_trace(iter); return trace_handle_return(&iter->seq); } static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) { if ((iter->ent->type != TRACE_BLK) || !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) return TRACE_TYPE_UNHANDLED; return print_one_line(iter, true); } static int blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { if (set) tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO; else tr->trace_flags |= TRACE_ITER_CONTEXT_INFO; } return 0; } static struct tracer blk_tracer __read_mostly = { .name = "blk", .init = blk_tracer_init, .reset = blk_tracer_reset, .start = blk_tracer_start, .stop = blk_tracer_stop, .print_header = blk_tracer_print_header, .print_line = blk_tracer_print_line, .flags = &blk_tracer_flags, .set_flag = blk_tracer_set_flag, }; static struct trace_event_functions trace_blk_event_funcs = { .trace = blk_trace_event_print, .binary = blk_trace_event_print_binary, }; static struct trace_event trace_blk_event = { .type = TRACE_BLK, .funcs = &trace_blk_event_funcs, }; static int __init init_blk_tracer(void) { if (!register_trace_event(&trace_blk_event)) { pr_warn("Warning: could not register block events\n"); return 1; } if (register_tracer(&blk_tracer) != 0) { pr_warn("Warning: could not register the block tracer\n"); unregister_trace_event(&trace_blk_event); return 1; } return 0; } device_initcall(init_blk_tracer); static int blk_trace_remove_queue(struct request_queue *q) { struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; blk_trace_stop(bt); put_probe_ref(); synchronize_rcu(); blk_trace_free(q, bt); return 0; } /* * Setup everything required to start tracing */ static int blk_trace_setup_queue(struct request_queue *q, struct block_device *bdev) { struct blk_trace *bt = NULL; int ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) return -ENOMEM; bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); if (!bt->msg_data) goto free_bt; bt->dev = bdev->bd_dev; bt->act_mask = (u16)-1; blk_trace_setup_lba(bt, bdev); rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); return 0; free_bt: blk_trace_free(q, bt); return ret; } /* * sysfs interface to enable and configure tracing */ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf); static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); #define BLK_TRACE_DEVICE_ATTR(_name) \ DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ sysfs_blk_trace_attr_show, \ sysfs_blk_trace_attr_store) static BLK_TRACE_DEVICE_ATTR(enable); static BLK_TRACE_DEVICE_ATTR(act_mask); static BLK_TRACE_DEVICE_ATTR(pid); static BLK_TRACE_DEVICE_ATTR(start_lba); static BLK_TRACE_DEVICE_ATTR(end_lba); static struct attribute *blk_trace_attrs[] = { &dev_attr_enable.attr, &dev_attr_act_mask.attr, &dev_attr_pid.attr, &dev_attr_start_lba.attr, &dev_attr_end_lba.attr, NULL }; struct attribute_group blk_trace_attr_group = { .name = "trace", .attrs = blk_trace_attrs, }; static const struct { int mask; const char *str; } mask_maps[] = { { BLK_TC_READ, "read" }, { BLK_TC_WRITE, "write" }, { BLK_TC_FLUSH, "flush" }, { BLK_TC_SYNC, "sync" }, { BLK_TC_QUEUE, "queue" }, { BLK_TC_REQUEUE, "requeue" }, { BLK_TC_ISSUE, "issue" }, { BLK_TC_COMPLETE, "complete" }, { BLK_TC_FS, "fs" }, { BLK_TC_PC, "pc" }, { BLK_TC_NOTIFY, "notify" }, { BLK_TC_AHEAD, "ahead" }, { BLK_TC_META, "meta" }, { BLK_TC_DISCARD, "discard" }, { BLK_TC_DRV_DATA, "drv_data" }, { BLK_TC_FUA, "fua" }, }; static int blk_trace_str2mask(const char *str) { int i; int mask = 0; char *buf, *s, *token; buf = kstrdup(str, GFP_KERNEL); if (buf == NULL) return -ENOMEM; s = strstrip(buf); while (1) { token = strsep(&s, ","); if (token == NULL) break; if (*token == '\0') continue; for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { if (strcasecmp(token, mask_maps[i].str) == 0) { mask |= mask_maps[i].mask; break; } } if (i == ARRAY_SIZE(mask_maps)) { mask = -EINVAL; break; } } kfree(buf); return mask; } static ssize_t blk_trace_mask2str(char *buf, int mask) { int i; char *p = buf; for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { if (mask & mask_maps[i].mask) { p += sprintf(p, "%s%s", (p == buf) ? "" : ",", mask_maps[i].str); } } *p++ = '\n'; return p - buf; } static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; ssize_t ret = -ENXIO; mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; } if (bt == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) ret = blk_trace_mask2str(buf, bt->act_mask); else if (attr == &dev_attr_pid) ret = sprintf(buf, "%u\n", bt->pid); else if (attr == &dev_attr_start_lba) ret = sprintf(buf, "%llu\n", bt->start_lba); else if (attr == &dev_attr_end_lba) ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: mutex_unlock(&q->debugfs_mutex); return ret; } static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; if (count == 0) goto out; if (attr == &dev_attr_act_mask) { if (kstrtoull(buf, 0, &value)) { /* Assume it is a list of trace category names */ ret = blk_trace_str2mask(buf); if (ret < 0) goto out; value = ret; } } else { if (kstrtoull(buf, 0, &value)) goto out; } mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { if (!!value == !!bt) { ret = 0; goto out_unlock_bdev; } if (value) ret = blk_trace_setup_queue(q, bdev); else ret = blk_trace_remove_queue(q); goto out_unlock_bdev; } ret = 0; if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); } if (ret == 0) { if (attr == &dev_attr_act_mask) bt->act_mask = value; else if (attr == &dev_attr_pid) bt->pid = value; else if (attr == &dev_attr_start_lba) bt->start_lba = value; else if (attr == &dev_attr_end_lba) bt->end_lba = value; } out_unlock_bdev: mutex_unlock(&q->debugfs_mutex); out: return ret ? ret : count; } #endif /* CONFIG_BLK_DEV_IO_TRACE */ #ifdef CONFIG_EVENT_TRACING /** * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string. * @rwbs: buffer to be filled * @opf: request operation type (REQ_OP_XXX) and flags for the tracepoint * * Description: * Maps each request operation and flag to a single character and fills the * buffer provided by the caller with resulting string. * **/ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) { int i = 0; if (opf & REQ_PREFLUSH) rwbs[i++] = 'F'; switch (opf & REQ_OP_MASK) { case REQ_OP_WRITE: rwbs[i++] = 'W'; break; case REQ_OP_DISCARD: rwbs[i++] = 'D'; break; case REQ_OP_SECURE_ERASE: rwbs[i++] = 'D'; rwbs[i++] = 'E'; break; case REQ_OP_FLUSH: rwbs[i++] = 'F'; break; case REQ_OP_READ: rwbs[i++] = 'R'; break; case REQ_OP_ZONE_APPEND: rwbs[i++] = 'Z'; rwbs[i++] = 'A'; break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: rwbs[i++] = 'Z'; rwbs[i++] = 'R'; if ((opf & REQ_OP_MASK) == REQ_OP_ZONE_RESET_ALL) rwbs[i++] = 'A'; break; case REQ_OP_ZONE_FINISH: rwbs[i++] = 'Z'; rwbs[i++] = 'F'; break; case REQ_OP_ZONE_OPEN: rwbs[i++] = 'Z'; rwbs[i++] = 'O'; break; case REQ_OP_ZONE_CLOSE: rwbs[i++] = 'Z'; rwbs[i++] = 'C'; break; default: rwbs[i++] = 'N'; } if (opf & REQ_FUA) rwbs[i++] = 'F'; if (opf & REQ_RAHEAD) rwbs[i++] = 'A'; if (opf & REQ_SYNC) rwbs[i++] = 'S'; if (opf & REQ_META) rwbs[i++] = 'M'; if (opf & REQ_ATOMIC) rwbs[i++] = 'U'; WARN_ON_ONCE(i >= RWBS_LEN); rwbs[i] = '\0'; } EXPORT_SYMBOL_GPL(blk_fill_rwbs); #endif /* CONFIG_EVENT_TRACING */
5 22 1 15 15 1 1 2079 10114 66 11 8 8 1 5 8 1 7 7 4 156 4 4 2 10287 8 42 22 22 70 70 53 36 53 2 6183 10310 102 10281 10251 6197 6193 80 110 102 35 108 35 2 2 2 10261 10256 10024 10030 34 22 26 10058 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * VLAN An implementation of 802.1Q VLAN tagging. * * Authors: Ben Greear <greearb@candelatech.com> */ #ifndef _LINUX_IF_VLAN_H_ #define _LINUX_IF_VLAN_H_ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/bug.h> #include <uapi/linux/if_vlan.h> #define VLAN_HLEN 4 /* The additional bytes required by VLAN * (in addition to the Ethernet header) */ #define VLAN_ETH_HLEN 18 /* Total octets in header. */ #define VLAN_ETH_ZLEN 64 /* Min. octets in frame sans FCS */ /* * According to 802.3ac, the packet can be 4 bytes longer. --Klika Jan */ #define VLAN_ETH_DATA_LEN 1500 /* Max. octets in payload */ #define VLAN_ETH_FRAME_LEN 1518 /* Max. octets in frame sans FCS */ #define VLAN_MAX_DEPTH 8 /* Max. number of nested VLAN tags parsed */ /* * struct vlan_hdr - vlan header * @h_vlan_TCI: priority and VLAN ID * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_hdr { __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; }; /** * struct vlan_ethhdr - vlan ethernet header (ethhdr + vlan_hdr) * @h_dest: destination ethernet address * @h_source: source ethernet address * @h_vlan_proto: ethernet protocol * @h_vlan_TCI: priority and VLAN ID * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_ethhdr { struct_group(addrs, unsigned char h_dest[ETH_ALEN]; unsigned char h_source[ETH_ALEN]; ); __be16 h_vlan_proto; __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; }; #include <linux/skbuff.h> static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) { return (struct vlan_ethhdr *)skb_mac_header(skb); } /* Prefer this version in TX path, instead of * skb_reset_mac_header() + vlan_eth_hdr() */ static inline struct vlan_ethhdr *skb_vlan_eth_hdr(const struct sk_buff *skb) { return (struct vlan_ethhdr *)skb->data; } #define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ #define VLAN_PRIO_SHIFT 13 #define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator / Drop Eligible Indicator */ #define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ #define VLAN_N_VID 4096 /* found in socket.c */ extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *)); #define skb_vlan_tag_present(__skb) (!!(__skb)->vlan_all) #define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #define skb_vlan_tag_get_cfi(__skb) (!!((__skb)->vlan_tci & VLAN_CFI_MASK)) #define skb_vlan_tag_get_prio(__skb) (((__skb)->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT) static inline int vlan_get_rx_ctag_filter_info(struct net_device *dev) { ASSERT_RTNL(); return notifier_to_errno(call_netdevice_notifiers(NETDEV_CVLAN_FILTER_PUSH_INFO, dev)); } static inline void vlan_drop_rx_ctag_filter_info(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_CVLAN_FILTER_DROP_INFO, dev); } static inline int vlan_get_rx_stag_filter_info(struct net_device *dev) { ASSERT_RTNL(); return notifier_to_errno(call_netdevice_notifiers(NETDEV_SVLAN_FILTER_PUSH_INFO, dev)); } static inline void vlan_drop_rx_stag_filter_info(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_SVLAN_FILTER_DROP_INFO, dev); } /** * struct vlan_pcpu_stats - VLAN percpu rx/tx stats * @rx_packets: number of received packets * @rx_bytes: number of received bytes * @rx_multicast: number of received multicast packets * @tx_packets: number of transmitted packets * @tx_bytes: number of transmitted bytes * @syncp: synchronization point for 64bit counters * @rx_errors: number of rx errors * @tx_dropped: number of tx drops */ struct vlan_pcpu_stats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t rx_multicast; u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; u32 rx_errors; u32 tx_dropped; }; #if IS_ENABLED(CONFIG_VLAN_8021Q) extern struct net_device *__vlan_find_dev_deep_rcu(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id); extern int vlan_for_each(struct net_device *dev, int (*action)(struct net_device *dev, int vid, void *arg), void *arg); extern struct net_device *vlan_dev_real_dev(const struct net_device *dev); extern u16 vlan_dev_vlan_id(const struct net_device *dev); extern __be16 vlan_dev_vlan_proto(const struct net_device *dev); /** * struct vlan_priority_tci_mapping - vlan egress priority mappings * @priority: skb priority * @vlan_qos: vlan priority: (skb->priority << 13) & 0xE000 * @next: pointer to next struct */ struct vlan_priority_tci_mapping { u32 priority; u16 vlan_qos; struct vlan_priority_tci_mapping *next; }; struct proc_dir_entry; struct netpoll; /** * struct vlan_dev_priv - VLAN private device data * @nr_ingress_mappings: number of ingress priority mappings * @ingress_priority_map: ingress priority mappings * @nr_egress_mappings: number of egress priority mappings * @egress_priority_map: hash of egress priority mappings * @vlan_proto: VLAN encapsulation protocol * @vlan_id: VLAN identifier * @flags: device flags * @real_dev: underlying netdevice * @dev_tracker: refcount tracker for @real_dev reference * @real_dev_addr: address of underlying netdevice * @dent: proc dir entry * @vlan_pcpu_stats: ptr to percpu rx stats * @netpoll: netpoll instance "propagated" down to @real_dev */ struct vlan_dev_priv { unsigned int nr_ingress_mappings; u32 ingress_priority_map[8]; unsigned int nr_egress_mappings; struct vlan_priority_tci_mapping *egress_priority_map[16]; __be16 vlan_proto; u16 vlan_id; u16 flags; struct net_device *real_dev; netdevice_tracker dev_tracker; unsigned char real_dev_addr[ETH_ALEN]; struct proc_dir_entry *dent; struct vlan_pcpu_stats __percpu *vlan_pcpu_stats; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif }; static inline bool is_vlan_dev(const struct net_device *dev) { return dev->priv_flags & IFF_802_1Q_VLAN; } static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev) { return netdev_priv(dev); } static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio) { struct vlan_priority_tci_mapping *mp; smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */ mp = vlan_dev_priv(dev)->egress_priority_map[(skprio & 0xF)]; while (mp) { if (mp->priority == skprio) { return mp->vlan_qos; /* This should already be shifted * to mask correctly with the * VLAN's TCI */ } mp = mp->next; } return 0; } extern bool vlan_do_receive(struct sk_buff **skb); extern int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid); extern void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid); extern int vlan_vids_add_by_dev(struct net_device *dev, const struct net_device *by_dev); extern void vlan_vids_del_by_dev(struct net_device *dev, const struct net_device *by_dev); extern bool vlan_uses_dev(const struct net_device *dev); #else static inline bool is_vlan_dev(const struct net_device *dev) { return false; } static inline struct net_device * __vlan_find_dev_deep_rcu(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id) { return NULL; } static inline int vlan_for_each(struct net_device *dev, int (*action)(struct net_device *dev, int vid, void *arg), void *arg) { return 0; } static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev) { WARN_ON_ONCE(1); return NULL; } static inline u16 vlan_dev_vlan_id(const struct net_device *dev) { WARN_ON_ONCE(1); return 0; } static inline __be16 vlan_dev_vlan_proto(const struct net_device *dev) { WARN_ON_ONCE(1); return 0; } static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio) { return 0; } static inline bool vlan_do_receive(struct sk_buff **skb) { return false; } static inline int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid) { return 0; } static inline void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid) { } static inline int vlan_vids_add_by_dev(struct net_device *dev, const struct net_device *by_dev) { return 0; } static inline void vlan_vids_del_by_dev(struct net_device *dev, const struct net_device *by_dev) { } static inline bool vlan_uses_dev(const struct net_device *dev) { return false; } #endif /** * eth_type_vlan - check for valid vlan ether type. * @ethertype: ether type to check * * Returns: true if the ether type is a vlan ether type. */ static inline bool eth_type_vlan(__be16 ethertype) { switch (ethertype) { case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): return true; default: return false; } } static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { if (proto == htons(ETH_P_8021Q) && features & NETIF_F_HW_VLAN_CTAG_TX) return true; if (proto == htons(ETH_P_8021AD) && features & NETIF_F_HW_VLAN_STAG_TX) return true; return false; } /** * __vlan_insert_inner_tag - inner VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len * Does not change skb->protocol so this function can be used during receive. * * Returns: error if skb_cow_head fails. */ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, unsigned int mac_len) { struct vlan_ethhdr *veth; if (skb_cow_head(skb, VLAN_HLEN) < 0) return -ENOMEM; skb_push(skb, VLAN_HLEN); /* Move the mac header sans proto to the beginning of the new header. */ if (likely(mac_len > ETH_TLEN)) memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN); if (skb_mac_header_was_set(skb)) skb->mac_header -= VLAN_HLEN; veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN); /* first, the ethernet type */ if (likely(mac_len >= ETH_TLEN)) { /* h_vlan_encapsulated_proto should already be populated, and * skb->data has space for h_vlan_proto */ veth->h_vlan_proto = vlan_proto; } else { /* h_vlan_encapsulated_proto should not be populated, and * skb->data has no space for h_vlan_proto */ veth->h_vlan_encapsulated_proto = skb->protocol; } /* now, the TCI */ veth->h_vlan_TCI = htons(vlan_tci); return 0; } /** * __vlan_insert_tag - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Does not change skb->protocol so this function can be used during receive. * * Returns: error if skb_cow_head fails. */ static inline int __vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { return __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN); } /** * vlan_insert_inner_tag - inner VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len * Returns a VLAN tagged skb. This might change skb->head. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. * * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, unsigned int mac_len) { int err; err = __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, mac_len); if (err) { dev_kfree_skb_any(skb); return NULL; } return skb; } /** * vlan_insert_tag - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Returns a VLAN tagged skb. This might change skb->head. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. * * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { return vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN); } /** * vlan_insert_tag_set_proto - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Returns a VLAN tagged skb. This might change skb->head. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. * * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { skb = vlan_insert_tag(skb, vlan_proto, vlan_tci); if (skb) skb->protocol = vlan_proto; return skb; } /** * __vlan_hwaccel_clear_tag - clear hardware accelerated VLAN info * @skb: skbuff to clear * * Clears the VLAN information from @skb */ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) { skb->vlan_all = 0; } /** * __vlan_hwaccel_copy_tag - copy hardware accelerated VLAN info from another skb * @dst: skbuff to copy to * @src: skbuff to copy from * * Copies VLAN information from @src to @dst (for branchless code) */ static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src) { dst->vlan_all = src->vlan_all; } /* * __vlan_hwaccel_push_inside - pushes vlan tag to the payload * @skb: skbuff to tag * * Pushes the VLAN tag from @skb->vlan_tci inside to the payload. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. */ static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb) { skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); if (likely(skb)) __vlan_hwaccel_clear_tag(skb); return skb; } /** * __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Puts the VLAN TCI in @skb->vlan_tci and lets the device do the rest */ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { skb->vlan_proto = vlan_proto; skb->vlan_tci = vlan_tci; } /** * __vlan_get_tag - get the VLAN ID that is part of the payload * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns: error if the skb is not of VLAN type */ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb); if (!eth_type_vlan(veth->h_vlan_proto)) return -ENODATA; *vlan_tci = ntohs(veth->h_vlan_TCI); return 0; } /** * __vlan_hwaccel_get_tag - get the VLAN ID that is in @skb->cb[] * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns: error if @skb->vlan_tci is not set correctly */ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { if (skb_vlan_tag_present(skb)) { *vlan_tci = skb_vlan_tag_get(skb); return 0; } else { *vlan_tci = 0; return -ENODATA; } } /** * vlan_get_tag - get the VLAN ID from the skb * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns: error if the skb is not VLAN tagged */ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { if (skb->dev->features & NETIF_F_HW_VLAN_CTAG_TX) { return __vlan_hwaccel_get_tag(skb, vlan_tci); } else { return __vlan_get_tag(skb, vlan_tci); } } /** * __vlan_get_protocol_offset() - get protocol EtherType. * @skb: skbuff to query * @type: first vlan protocol * @mac_offset: MAC offset * @depth: buffer to store length of eth and vlan tags in bytes * * Returns: the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 __vlan_get_protocol_offset(const struct sk_buff *skb, __be16 type, int mac_offset, int *depth) { unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; /* if type is 802.1Q/AD then the header should already be * present at mac_len - VLAN_HLEN (if mac_len > 0), or at * ETH_HLEN otherwise */ if (eth_type_vlan(type)) { if (vlan_depth) { if (WARN_ON(vlan_depth < VLAN_HLEN)) return 0; vlan_depth -= VLAN_HLEN; } else { vlan_depth = ETH_HLEN; } do { struct vlan_hdr vhdr, *vh; vh = skb_header_pointer(skb, mac_offset + vlan_depth, sizeof(vhdr), &vhdr); if (unlikely(!vh || !--parse_depth)) return 0; type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; } while (eth_type_vlan(type)); } if (depth) *depth = vlan_depth; return type; } static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, int *depth) { return __vlan_get_protocol_offset(skb, type, 0, depth); } /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * * Returns: the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) { return __vlan_get_protocol(skb, skb->protocol, NULL); } /* This version of __vlan_get_protocol() also pulls mac header in skb->head */ static inline __be16 vlan_get_protocol_and_depth(struct sk_buff *skb, __be16 type, int *depth) { int maclen; type = __vlan_get_protocol(skb, type, &maclen); if (type) { if (!pskb_may_pull(skb, maclen)) type = 0; else if (depth) *depth = maclen; } return type; } /* A getter for the SKB protocol field which will handle VLAN tags consistently * whether VLAN acceleration is enabled or not. */ static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) { if (!skip_vlan) /* VLAN acceleration strips the VLAN header from the skb and * moves it to skb->vlan_proto */ return skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; return vlan_get_protocol(skb); } static inline void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr) { __be16 proto; unsigned short *rawp; /* * Was a VLAN packet, grab the encapsulated protocol, which the layer * three protocols care about. */ proto = vhdr->h_vlan_encapsulated_proto; if (eth_proto_is_802_3(proto)) { skb->protocol = proto; return; } rawp = (unsigned short *)(vhdr + 1); if (*rawp == 0xFFFF) /* * This is a magic hack to spot IPX packets. Older Novell * breaks the protocol design and runs IPX over 802.3 without * an 802.2 LLC layer. We look for FFFF which isn't a used * 802.2 SSAP/DSAP. This won't work for fault tolerant netware * but does for the rest. */ skb->protocol = htons(ETH_P_802_3); else /* * Real 802.2 LLC */ skb->protocol = htons(ETH_P_802_2); } /** * vlan_remove_tag - remove outer VLAN tag from payload * @skb: skbuff to remove tag from * @vlan_tci: buffer to store value * * Expects the skb to contain a VLAN tag in the payload, and to have skb->data * pointing at the MAC header. * * Returns: a new pointer to skb->data, or NULL on failure to pull. */ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); *vlan_tci = ntohs(vhdr->h_vlan_TCI); memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); vlan_set_encap_proto(skb, vhdr); return __skb_pull(skb, VLAN_HLEN); } /** * skb_vlan_tagged - check if skb is vlan tagged. * @skb: skbuff to query * * Returns: true if the skb is tagged, regardless of whether it is hardware * accelerated or not. */ static inline bool skb_vlan_tagged(const struct sk_buff *skb) { if (!skb_vlan_tag_present(skb) && likely(!eth_type_vlan(skb->protocol))) return false; return true; } /** * skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers. * @skb: skbuff to query * * Returns: true if the skb is tagged with multiple vlan headers, regardless * of whether it is hardware accelerated or not. */ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb) { __be16 protocol = skb->protocol; if (!skb_vlan_tag_present(skb)) { struct vlan_ethhdr *veh; if (likely(!eth_type_vlan(protocol))) return false; if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) return false; veh = skb_vlan_eth_hdr(skb); protocol = veh->h_vlan_encapsulated_proto; } if (!eth_type_vlan(protocol)) return false; return true; } /** * vlan_features_check - drop unsafe features for skb with multiple tags. * @skb: skbuff to query * @features: features to be checked * * Returns: features without unsafe ones if the skb has multiple tags. */ static inline netdev_features_t vlan_features_check(struct sk_buff *skb, netdev_features_t features) { if (skb_vlan_tagged_multi(skb)) { /* In the case of multi-tagged packets, use a direct mask * instead of using netdev_interesect_features(), to make * sure that only devices supporting NETIF_F_HW_CSUM will * have checksum offloading support. */ features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; } return features; } /** * compare_vlan_header - Compare two vlan headers * @h1: Pointer to vlan header * @h2: Pointer to vlan header * * Compare two vlan headers. * * Please note that alignment of h1 & h2 are only guaranteed to be 16 bits. * * Return: 0 if equal, arbitrary non-zero value if not equal. */ static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1, const struct vlan_hdr *h2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return *(u32 *)h1 ^ *(u32 *)h2; #else return ((__force u32)h1->h_vlan_TCI ^ (__force u32)h2->h_vlan_TCI) | ((__force u32)h1->h_vlan_encapsulated_proto ^ (__force u32)h2->h_vlan_encapsulated_proto); #endif } #endif /* !(_LINUX_IF_VLAN_H_) */
56 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 /* * net/tipc/eth_media.c: Ethernet bearer support for TIPC * * Copyright (c) 2001-2007, 2013-2014, Ericsson AB * Copyright (c) 2005-2008, 2011-2013, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "bearer.h" /* Convert Ethernet address (media address format) to string */ static int tipc_eth_addr2str(struct tipc_media_addr *addr, char *strbuf, int bufsz) { if (bufsz < 18) /* 18 = strlen("aa:bb:cc:dd:ee:ff\0") */ return 1; sprintf(strbuf, "%pM", addr->value); return 0; } /* Convert from media address format to discovery message addr format */ static int tipc_eth_addr2msg(char *msg, struct tipc_media_addr *addr) { memset(msg, 0, TIPC_MEDIA_INFO_SIZE); msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_ETH; memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, addr->value, ETH_ALEN); return 0; } /* Convert raw mac address format to media addr format */ static int tipc_eth_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, const char *msg) { memset(addr, 0, sizeof(*addr)); ether_addr_copy(addr->value, msg); addr->media_id = TIPC_MEDIA_TYPE_ETH; addr->broadcast = is_broadcast_ether_addr(addr->value); return 0; } /* Convert discovery msg addr format to Ethernet media addr format */ static int tipc_eth_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, char *msg) { /* Skip past preamble: */ msg += TIPC_MEDIA_ADDR_OFFSET; return tipc_eth_raw2addr(b, addr, msg); } /* Ethernet media registration info */ struct tipc_media eth_media_info = { .send_msg = tipc_l2_send_msg, .enable_media = tipc_enable_l2_media, .disable_media = tipc_disable_l2_media, .addr2str = tipc_eth_addr2str, .addr2msg = tipc_eth_addr2msg, .msg2addr = tipc_eth_msg2addr, .raw2addr = tipc_eth_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .min_win = TIPC_DEF_LINK_WIN, .max_win = TIPC_MAX_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_ETH, .hwaddr_len = ETH_ALEN, .name = "eth" };
3735 3741 3726 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 // SPDX-License-Identifier: GPL-2.0-or-later /* * Derived from arch/ppc/mm/extable.c and arch/i386/mm/extable.c. * * Copyright (C) 2004 Paul Mackerras, IBM Corp. */ #include <linux/bsearch.h> #include <linux/module.h> #include <linux/init.h> #include <linux/sort.h> #include <linux/uaccess.h> #include <linux/extable.h> #ifndef ARCH_HAS_RELATIVE_EXTABLE #define ex_to_insn(x) ((x)->insn) #else static inline unsigned long ex_to_insn(const struct exception_table_entry *x) { return (unsigned long)&x->insn + x->insn; } #endif #ifndef ARCH_HAS_RELATIVE_EXTABLE #define swap_ex NULL #else static void swap_ex(void *a, void *b, int size) { struct exception_table_entry *x = a, *y = b, tmp; int delta = b - a; tmp = *x; x->insn = y->insn + delta; y->insn = tmp.insn - delta; #ifdef swap_ex_entry_fixup swap_ex_entry_fixup(x, y, tmp, delta); #else x->fixup = y->fixup + delta; y->fixup = tmp.fixup - delta; #endif } #endif /* ARCH_HAS_RELATIVE_EXTABLE */ /* * The exception table needs to be sorted so that the binary * search that we use to find entries in it works properly. * This is used both for the kernel exception table and for * the exception tables of modules that get loaded. */ static int cmp_ex_sort(const void *a, const void *b) { const struct exception_table_entry *x = a, *y = b; /* avoid overflow */ if (ex_to_insn(x) > ex_to_insn(y)) return 1; if (ex_to_insn(x) < ex_to_insn(y)) return -1; return 0; } void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish) { sort(start, finish - start, sizeof(struct exception_table_entry), cmp_ex_sort, swap_ex); } #ifdef CONFIG_MODULES /* * If the exception table is sorted, any referring to the module init * will be at the beginning or the end. */ void trim_init_extable(struct module *m) { /*trim the beginning*/ while (m->num_exentries && within_module_init(ex_to_insn(&m->extable[0]), m)) { m->extable++; m->num_exentries--; } /*trim the end*/ while (m->num_exentries && within_module_init(ex_to_insn(&m->extable[m->num_exentries - 1]), m)) m->num_exentries--; } #endif /* CONFIG_MODULES */ static int cmp_ex_search(const void *key, const void *elt) { const struct exception_table_entry *_elt = elt; unsigned long _key = *(unsigned long *)key; /* avoid overflow */ if (_key > ex_to_insn(_elt)) return 1; if (_key < ex_to_insn(_elt)) return -1; return 0; } /* * Search one exception table for an entry corresponding to the * given instruction address, and return the address of the entry, * or NULL if none is found. * We use a binary search, and thus we assume that the table is * already sorted. */ const struct exception_table_entry * search_extable(const struct exception_table_entry *base, const size_t num, unsigned long value) { return bsearch(&value, base, num, sizeof(struct exception_table_entry), cmp_ex_search); }
190 191 9 191 198 186 274 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef __SOUND_CORE_H #define __SOUND_CORE_H /* * Main header file for the ALSA driver * Copyright (c) 1994-2001 by Jaroslav Kysela <perex@perex.cz> */ #include <linux/device.h> #include <linux/sched.h> /* wake_up() */ #include <linux/mutex.h> /* struct mutex */ #include <linux/rwsem.h> /* struct rw_semaphore */ #include <linux/pm.h> /* pm_message_t */ #include <linux/stringify.h> #include <linux/printk.h> #include <linux/xarray.h> /* number of supported soundcards */ #ifdef CONFIG_SND_DYNAMIC_MINORS #define SNDRV_CARDS CONFIG_SND_MAX_CARDS #else #define SNDRV_CARDS 8 /* don't change - minor numbers */ #endif #define CONFIG_SND_MAJOR 116 /* standard configuration */ /* forward declarations */ struct pci_dev; struct module; struct completion; /* device allocation stuff */ /* type of the object used in snd_device_*() * this also defines the calling order */ enum snd_device_type { SNDRV_DEV_LOWLEVEL, SNDRV_DEV_INFO, SNDRV_DEV_BUS, SNDRV_DEV_CODEC, SNDRV_DEV_PCM, SNDRV_DEV_COMPRESS, SNDRV_DEV_RAWMIDI, SNDRV_DEV_TIMER, SNDRV_DEV_SEQUENCER, SNDRV_DEV_HWDEP, SNDRV_DEV_JACK, SNDRV_DEV_CONTROL, /* NOTE: this must be the last one */ }; enum snd_device_state { SNDRV_DEV_BUILD, SNDRV_DEV_REGISTERED, SNDRV_DEV_DISCONNECTED, }; struct snd_device; struct snd_device_ops { int (*dev_free)(struct snd_device *dev); int (*dev_register)(struct snd_device *dev); int (*dev_disconnect)(struct snd_device *dev); }; struct snd_device { struct list_head list; /* list of registered devices */ struct snd_card *card; /* card which holds this device */ enum snd_device_state state; /* state of the device */ enum snd_device_type type; /* device type */ void *device_data; /* device structure */ const struct snd_device_ops *ops; /* operations */ }; #define snd_device(n) list_entry(n, struct snd_device, list) /* main structure for soundcard */ struct snd_card { int number; /* number of soundcard (index to snd_cards) */ char id[16]; /* id string of this card */ char driver[16]; /* driver name */ char shortname[32]; /* short name of this soundcard */ char longname[80]; /* name of this soundcard */ char irq_descr[32]; /* Interrupt description */ char mixername[80]; /* mixer name */ char components[128]; /* card components delimited with space */ struct module *module; /* top-level module */ void *private_data; /* private data for soundcard */ void (*private_free) (struct snd_card *card); /* callback for freeing of private data */ struct list_head devices; /* devices */ struct device *ctl_dev; /* control device */ unsigned int last_numid; /* last used numeric ID */ struct rw_semaphore controls_rwsem; /* controls lock (list and values) */ rwlock_t controls_rwlock; /* lock for lookup and ctl_files list */ int controls_count; /* count of all controls */ size_t user_ctl_alloc_size; // current memory allocation by user controls. struct list_head controls; /* all controls for this card */ struct list_head ctl_files; /* active control files */ #ifdef CONFIG_SND_CTL_FAST_LOOKUP struct xarray ctl_numids; /* hash table for numids */ struct xarray ctl_hash; /* hash table for ctl id matching */ bool ctl_hash_collision; /* ctl_hash collision seen? */ #endif struct snd_info_entry *proc_root; /* root for soundcard specific files */ struct proc_dir_entry *proc_root_link; /* number link to real id */ struct list_head files_list; /* all files associated to this card */ struct snd_shutdown_f_ops *s_f_ops; /* file operations in the shutdown state */ spinlock_t files_lock; /* lock the files for this card */ int shutdown; /* this card is going down */ struct completion *release_completion; struct device *dev; /* device assigned to this card */ struct device card_dev; /* cardX object for sysfs */ const struct attribute_group *dev_groups[4]; /* assigned sysfs attr */ bool registered; /* card_dev is registered? */ bool managed; /* managed via devres */ bool releasing; /* during card free process */ int sync_irq; /* assigned irq, used for PCM sync */ wait_queue_head_t remove_sleep; size_t total_pcm_alloc_bytes; /* total amount of allocated buffers */ struct mutex memory_mutex; /* protection for the above */ #ifdef CONFIG_SND_DEBUG struct dentry *debugfs_root; /* debugfs root for card */ #endif #ifdef CONFIG_PM unsigned int power_state; /* power state */ atomic_t power_ref; wait_queue_head_t power_sleep; wait_queue_head_t power_ref_sleep; #endif #if IS_ENABLED(CONFIG_SND_MIXER_OSS) struct snd_mixer_oss *mixer_oss; int mixer_oss_change_count; #endif }; #define dev_to_snd_card(p) container_of(p, struct snd_card, card_dev) #ifdef CONFIG_PM static inline unsigned int snd_power_get_state(struct snd_card *card) { return READ_ONCE(card->power_state); } static inline void snd_power_change_state(struct snd_card *card, unsigned int state) { WRITE_ONCE(card->power_state, state); wake_up(&card->power_sleep); } /** * snd_power_ref - Take the reference count for power control * @card: sound card object * * The power_ref reference of the card is used for managing to block * the snd_power_sync_ref() operation. This function increments the reference. * The counterpart snd_power_unref() has to be called appropriately later. */ static inline void snd_power_ref(struct snd_card *card) { atomic_inc(&card->power_ref); } /** * snd_power_unref - Release the reference count for power control * @card: sound card object */ static inline void snd_power_unref(struct snd_card *card) { if (atomic_dec_and_test(&card->power_ref)) wake_up(&card->power_ref_sleep); } /** * snd_power_sync_ref - wait until the card power_ref is freed * @card: sound card object * * This function is used to synchronize with the pending power_ref being * released. */ static inline void snd_power_sync_ref(struct snd_card *card) { wait_event(card->power_ref_sleep, !atomic_read(&card->power_ref)); } /* init.c */ int snd_power_wait(struct snd_card *card); int snd_power_ref_and_wait(struct snd_card *card); #else /* ! CONFIG_PM */ static inline int snd_power_wait(struct snd_card *card) { return 0; } static inline void snd_power_ref(struct snd_card *card) {} static inline void snd_power_unref(struct snd_card *card) {} static inline int snd_power_ref_and_wait(struct snd_card *card) { return 0; } static inline void snd_power_sync_ref(struct snd_card *card) {} #define snd_power_get_state(card) ({ (void)(card); SNDRV_CTL_POWER_D0; }) #define snd_power_change_state(card, state) do { (void)(card); } while (0) #endif /* CONFIG_PM */ struct snd_minor { int type; /* SNDRV_DEVICE_TYPE_XXX */ int card; /* card number */ int device; /* device number */ const struct file_operations *f_ops; /* file operations */ void *private_data; /* private data for f_ops->open */ struct device *dev; /* device for sysfs */ struct snd_card *card_ptr; /* assigned card instance */ }; /* return a device pointer linked to each sound device as a parent */ static inline struct device *snd_card_get_device_link(struct snd_card *card) { return card ? &card->card_dev : NULL; } /* sound.c */ extern int snd_major; extern int snd_ecards_limit; extern const struct class sound_class; #ifdef CONFIG_SND_DEBUG extern struct dentry *sound_debugfs_root; #endif void snd_request_card(int card); int snd_device_alloc(struct device **dev_p, struct snd_card *card); int snd_register_device(int type, struct snd_card *card, int dev, const struct file_operations *f_ops, void *private_data, struct device *device); int snd_unregister_device(struct device *dev); void *snd_lookup_minor_data(unsigned int minor, int type); #ifdef CONFIG_SND_OSSEMUL int snd_register_oss_device(int type, struct snd_card *card, int dev, const struct file_operations *f_ops, void *private_data); int snd_unregister_oss_device(int type, struct snd_card *card, int dev); void *snd_lookup_oss_minor_data(unsigned int minor, int type); #endif int snd_minor_info_init(void); /* sound_oss.c */ #ifdef CONFIG_SND_OSSEMUL int snd_minor_info_oss_init(void); #else static inline int snd_minor_info_oss_init(void) { return 0; } #endif /* memory.c */ int copy_to_user_fromio(void __user *dst, const volatile void __iomem *src, size_t count); int copy_from_user_toio(volatile void __iomem *dst, const void __user *src, size_t count); /* init.c */ int snd_card_locked(int card); #if IS_ENABLED(CONFIG_SND_MIXER_OSS) #define SND_MIXER_OSS_NOTIFY_REGISTER 0 #define SND_MIXER_OSS_NOTIFY_DISCONNECT 1 #define SND_MIXER_OSS_NOTIFY_FREE 2 extern int (*snd_mixer_oss_notify_callback)(struct snd_card *card, int cmd); #endif int snd_card_new(struct device *parent, int idx, const char *xid, struct module *module, int extra_size, struct snd_card **card_ret); int snd_devm_card_new(struct device *parent, int idx, const char *xid, struct module *module, size_t extra_size, struct snd_card **card_ret); void snd_card_disconnect(struct snd_card *card); void snd_card_disconnect_sync(struct snd_card *card); void snd_card_free(struct snd_card *card); void snd_card_free_when_closed(struct snd_card *card); int snd_card_free_on_error(struct device *dev, int ret); void snd_card_set_id(struct snd_card *card, const char *id); int snd_card_register(struct snd_card *card); int snd_card_info_init(void); int snd_card_add_dev_attr(struct snd_card *card, const struct attribute_group *group); int snd_component_add(struct snd_card *card, const char *component); int snd_card_file_add(struct snd_card *card, struct file *file); int snd_card_file_remove(struct snd_card *card, struct file *file); struct snd_card *snd_card_ref(int card); /** * snd_card_unref - Unreference the card object * @card: the card object to unreference * * Call this function for the card object that was obtained via snd_card_ref() * or snd_lookup_minor_data(). */ static inline void snd_card_unref(struct snd_card *card) { put_device(&card->card_dev); } #define snd_card_set_dev(card, devptr) ((card)->dev = (devptr)) /* device.c */ int snd_device_new(struct snd_card *card, enum snd_device_type type, void *device_data, const struct snd_device_ops *ops); int snd_device_register(struct snd_card *card, void *device_data); int snd_device_register_all(struct snd_card *card); void snd_device_disconnect(struct snd_card *card, void *device_data); void snd_device_disconnect_all(struct snd_card *card); void snd_device_free(struct snd_card *card, void *device_data); void snd_device_free_all(struct snd_card *card); /* isadma.c */ #ifdef CONFIG_ISA_DMA_API #define DMA_MODE_NO_ENABLE 0x0100 void snd_dma_program(unsigned long dma, unsigned long addr, unsigned int size, unsigned short mode); void snd_dma_disable(unsigned long dma); unsigned int snd_dma_pointer(unsigned long dma, unsigned int size); int snd_devm_request_dma(struct device *dev, int dma, const char *name); #endif /* misc.c */ struct resource; void release_and_free_resource(struct resource *res); /* --- */ #ifdef CONFIG_SND_DEBUG /** * snd_BUG - give a BUG warning message and stack trace * * Calls WARN() if CONFIG_SND_DEBUG is set. * Ignored when CONFIG_SND_DEBUG is not set. */ #define snd_BUG() WARN(1, "BUG?\n") /** * snd_BUG_ON - debugging check macro * @cond: condition to evaluate * * Has the same behavior as WARN_ON when CONFIG_SND_DEBUG is set, * otherwise just evaluates the conditional and returns the value. */ #define snd_BUG_ON(cond) WARN_ON((cond)) #else /* !CONFIG_SND_DEBUG */ #define snd_BUG() do { } while (0) #define snd_BUG_ON(condition) ({ \ int __ret_warn_on = !!(condition); \ unlikely(__ret_warn_on); \ }) #endif /* CONFIG_SND_DEBUG */ #define SNDRV_OSS_VERSION ((3<<16)|(8<<8)|(1<<4)|(0)) /* 3.8.1a */ /* for easier backward-porting */ #if IS_ENABLED(CONFIG_GAMEPORT) #define gameport_set_dev_parent(gp,xdev) ((gp)->dev.parent = (xdev)) #define gameport_set_port_data(gp,r) ((gp)->port_data = (r)) #define gameport_get_port_data(gp) (gp)->port_data #endif /* PCI quirk list helper */ struct snd_pci_quirk { unsigned short subvendor; /* PCI subvendor ID */ unsigned short subdevice; /* PCI subdevice ID */ unsigned short subdevice_mask; /* bitmask to match */ int value; /* value */ #ifdef CONFIG_SND_DEBUG_VERBOSE const char *name; /* name of the device (optional) */ #endif }; #define _SND_PCI_QUIRK_ID_MASK(vend, mask, dev) \ .subvendor = (vend), .subdevice = (dev), .subdevice_mask = (mask) #define _SND_PCI_QUIRK_ID(vend, dev) \ _SND_PCI_QUIRK_ID_MASK(vend, 0xffff, dev) #define SND_PCI_QUIRK_ID(vend,dev) {_SND_PCI_QUIRK_ID(vend, dev)} #ifdef CONFIG_SND_DEBUG_VERBOSE #define SND_PCI_QUIRK(vend,dev,xname,val) \ {_SND_PCI_QUIRK_ID(vend, dev), .value = (val), .name = (xname)} #define SND_PCI_QUIRK_VENDOR(vend, xname, val) \ {_SND_PCI_QUIRK_ID_MASK(vend, 0, 0), .value = (val), .name = (xname)} #define SND_PCI_QUIRK_MASK(vend, mask, dev, xname, val) \ {_SND_PCI_QUIRK_ID_MASK(vend, mask, dev), \ .value = (val), .name = (xname)} #define snd_pci_quirk_name(q) ((q)->name) #else #define SND_PCI_QUIRK(vend,dev,xname,val) \ {_SND_PCI_QUIRK_ID(vend, dev), .value = (val)} #define SND_PCI_QUIRK_MASK(vend, mask, dev, xname, val) \ {_SND_PCI_QUIRK_ID_MASK(vend, mask, dev), .value = (val)} #define SND_PCI_QUIRK_VENDOR(vend, xname, val) \ {_SND_PCI_QUIRK_ID_MASK(vend, 0, 0), .value = (val)} #define snd_pci_quirk_name(q) "" #endif #ifdef CONFIG_PCI const struct snd_pci_quirk * snd_pci_quirk_lookup(struct pci_dev *pci, const struct snd_pci_quirk *list); const struct snd_pci_quirk * snd_pci_quirk_lookup_id(u16 vendor, u16 device, const struct snd_pci_quirk *list); #else static inline const struct snd_pci_quirk * snd_pci_quirk_lookup(struct pci_dev *pci, const struct snd_pci_quirk *list) { return NULL; } static inline const struct snd_pci_quirk * snd_pci_quirk_lookup_id(u16 vendor, u16 device, const struct snd_pci_quirk *list) { return NULL; } #endif /* async signal helpers */ struct snd_fasync; int snd_fasync_helper(int fd, struct file *file, int on, struct snd_fasync **fasyncp); void snd_kill_fasync(struct snd_fasync *fasync, int signal, int poll); void snd_fasync_free(struct snd_fasync *fasync); #endif /* __SOUND_CORE_H */
9 5 5 4 5 5 4 3 5 5 4 4 3 4 4 7 7 4 4 2 4 4 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0-only /* * vMTRR implementation * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * Copyright(C) 2015 Intel Corporation. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com> * Marcelo Tosatti <mtosatti@redhat.com> * Paolo Bonzini <pbonzini@redhat.com> * Xiao Guangrong <guangrong.xiao@linux.intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <asm/mtrr.h> #include "cpuid.h" #include "x86.h" static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr) { int index; switch (msr) { case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1): index = msr - MTRRphysBase_MSR(0); return &vcpu->arch.mtrr_state.var[index]; case MSR_MTRRfix64K_00000: return &vcpu->arch.mtrr_state.fixed_64k; case MSR_MTRRfix16K_80000: case MSR_MTRRfix16K_A0000: index = msr - MSR_MTRRfix16K_80000; return &vcpu->arch.mtrr_state.fixed_16k[index]; case MSR_MTRRfix4K_C0000: case MSR_MTRRfix4K_C8000: case MSR_MTRRfix4K_D0000: case MSR_MTRRfix4K_D8000: case MSR_MTRRfix4K_E0000: case MSR_MTRRfix4K_E8000: case MSR_MTRRfix4K_F0000: case MSR_MTRRfix4K_F8000: index = msr - MSR_MTRRfix4K_C0000; return &vcpu->arch.mtrr_state.fixed_4k[index]; case MSR_MTRRdefType: return &vcpu->arch.mtrr_state.deftype; default: break; } return NULL; } static bool valid_mtrr_type(unsigned t) { return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ } static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) { int i; u64 mask; if (msr == MSR_MTRRdefType) { if (data & ~0xcff) return false; return valid_mtrr_type(data & 0xff); } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { for (i = 0; i < 8 ; i++) if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) return false; return true; } /* variable MTRRs */ if (WARN_ON_ONCE(!(msr >= MTRRphysBase_MSR(0) && msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1)))) return false; mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu); if ((msr & 1) == 0) { /* MTRR base */ if (!valid_mtrr_type(data & 0xff)) return false; mask |= 0xf00; } else { /* MTRR mask */ mask |= 0x7ff; } return (data & mask) == 0; } int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { u64 *mtrr; mtrr = find_mtrr(vcpu, msr); if (!mtrr) return 1; if (!kvm_mtrr_valid(vcpu, msr, data)) return 1; *mtrr = data; return 0; } int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 *mtrr; /* MSR_MTRRcap is a readonly MSR. */ if (msr == MSR_MTRRcap) { /* * SMRR = 0 * WC = 1 * FIX = 1 * VCNT = KVM_NR_VAR_MTRR */ *pdata = 0x500 | KVM_NR_VAR_MTRR; return 0; } mtrr = find_mtrr(vcpu, msr); if (!mtrr) return 1; *pdata = *mtrr; return 0; }
6539 13 86 26287 2337 152 30 77 3230 29576 2095 29512 5 307 257 1299 46 23 6 1301 11 645 378 1465 8555 104 1842 1 2145 750 3 6915 7333 1819 275 28 392 6 447 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FORTIFY_STRING_H_ #define _LINUX_FORTIFY_STRING_H_ #include <linux/bitfield.h> #include <linux/bug.h> #include <linux/const.h> #include <linux/limits.h> #define __FORTIFY_INLINE extern __always_inline __gnu_inline __overloadable #define __RENAME(x) __asm__(#x) #define FORTIFY_REASON_DIR(r) FIELD_GET(BIT(0), r) #define FORTIFY_REASON_FUNC(r) FIELD_GET(GENMASK(7, 1), r) #define FORTIFY_REASON(func, write) (FIELD_PREP(BIT(0), write) | \ FIELD_PREP(GENMASK(7, 1), func)) /* Overridden by KUnit tests. */ #ifndef fortify_panic # define fortify_panic(func, write, avail, size, retfail) \ __fortify_panic(FORTIFY_REASON(func, write), avail, size) #endif #ifndef fortify_warn_once # define fortify_warn_once(x...) WARN_ONCE(x) #endif #define FORTIFY_READ 0 #define FORTIFY_WRITE 1 #define EACH_FORTIFY_FUNC(macro) \ macro(strncpy), \ macro(strnlen), \ macro(strlen), \ macro(strscpy), \ macro(strlcat), \ macro(strcat), \ macro(strncat), \ macro(memset), \ macro(memcpy), \ macro(memmove), \ macro(memscan), \ macro(memcmp), \ macro(memchr), \ macro(memchr_inv), \ macro(kmemdup), \ macro(strcpy), \ macro(UNKNOWN), #define MAKE_FORTIFY_FUNC(func) FORTIFY_FUNC_##func enum fortify_func { EACH_FORTIFY_FUNC(MAKE_FORTIFY_FUNC) }; void __fortify_report(const u8 reason, const size_t avail, const size_t size); void __fortify_panic(const u8 reason, const size_t avail, const size_t size) __cold __noreturn; void __read_overflow(void) __compiletime_error("detected read beyond size of object (1st parameter)"); void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)"); void __read_overflow2_field(size_t avail, size_t wanted) __compiletime_warning("detected read beyond size of field (2nd parameter); maybe use struct_group()?"); void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)"); void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("detected write beyond size of field (1st parameter); maybe use struct_group()?"); #define __compiletime_strlen(p) \ ({ \ char *__p = (char *)(p); \ size_t __ret = SIZE_MAX; \ const size_t __p_size = __member_size(p); \ if (__p_size != SIZE_MAX && \ __builtin_constant_p(*__p)) { \ size_t __p_len = __p_size - 1; \ if (__builtin_constant_p(__p[__p_len]) && \ __p[__p_len] == '\0') \ __ret = __builtin_strlen(__p); \ } \ __ret; \ }) #if defined(__SANITIZE_ADDRESS__) #if !defined(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX) && !defined(CONFIG_GENERIC_ENTRY) extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); #elif defined(CONFIG_KASAN_GENERIC) extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(__asan_memset); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(__asan_memmove); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(__asan_memcpy); #else /* CONFIG_KASAN_SW_TAGS */ extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(__hwasan_memset); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(__hwasan_memmove); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(__hwasan_memcpy); #endif extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); #else #if defined(__SANITIZE_MEMORY__) /* * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the * corresponding __msan_XXX functions. */ #include <linux/kmsan_string.h> #define __underlying_memcpy __msan_memcpy #define __underlying_memmove __msan_memmove #define __underlying_memset __msan_memset #else #define __underlying_memcpy __builtin_memcpy #define __underlying_memmove __builtin_memmove #define __underlying_memset __builtin_memset #endif #define __underlying_memchr __builtin_memchr #define __underlying_memcmp __builtin_memcmp #define __underlying_strcat __builtin_strcat #define __underlying_strcpy __builtin_strcpy #define __underlying_strlen __builtin_strlen #define __underlying_strncat __builtin_strncat #define __underlying_strncpy __builtin_strncpy #endif /** * unsafe_memcpy - memcpy implementation with no FORTIFY bounds checking * * @dst: Destination memory address to write to * @src: Source memory address to read from * @bytes: How many bytes to write to @dst from @src * @justification: Free-form text or comment describing why the use is needed * * This should be used for corner cases where the compiler cannot do the * right thing, or during transitions between APIs, etc. It should be used * very rarely, and includes a place for justification detailing where bounds * checking has happened, and why existing solutions cannot be employed. */ #define unsafe_memcpy(dst, src, bytes, justification) \ __underlying_memcpy(dst, src, bytes) /* * Clang's use of __builtin_*object_size() within inlines needs hinting via * __pass_*object_size(). The preference is to only ever use type 1 (member * size, rather than struct size), but there remain some stragglers using * type 0 that will be converted in the future. */ #if __has_builtin(__builtin_dynamic_object_size) #define POS __pass_dynamic_object_size(1) #define POS0 __pass_dynamic_object_size(0) #else #define POS __pass_object_size(1) #define POS0 __pass_object_size(0) #endif #define __compiletime_lessthan(bounds, length) ( \ __builtin_constant_p((bounds) < (length)) && \ (bounds) < (length) \ ) /** * strncpy - Copy a string to memory with non-guaranteed NUL padding * * @p: pointer to destination of copy * @q: pointer to NUL-terminated source string to copy * @size: bytes to write at @p * * If strlen(@q) >= @size, the copy of @q will stop after @size bytes, * and @p will NOT be NUL-terminated * * If strlen(@q) < @size, following the copy of @q, trailing NUL bytes * will be written to @p until @size total bytes have been written. * * Do not use this function. While FORTIFY_SOURCE tries to avoid * over-reads of @q, it cannot defend against writing unterminated * results to @p. Using strncpy() remains ambiguous and fragile. * Instead, please choose an alternative, so that the expectation * of @p's contents is unambiguous: * * +--------------------+--------------------+------------+ * | **p** needs to be: | padded to **size** | not padded | * +====================+====================+============+ * | NUL-terminated | strscpy_pad() | strscpy() | * +--------------------+--------------------+------------+ * | not NUL-terminated | strtomem_pad() | strtomem() | * +--------------------+--------------------+------------+ * * Note strscpy*()'s differing return values for detecting truncation, * and strtomem*()'s expectation that the destination is marked with * __nonstring when it is a character array. * */ __FORTIFY_INLINE __diagnose_as(__builtin_strncpy, 1, 2, 3) char *strncpy(char * const POS p, const char *q, __kernel_size_t size) { const size_t p_size = __member_size(p); if (__compiletime_lessthan(p_size, size)) __write_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_strncpy, FORTIFY_WRITE, p_size, size, p); return __underlying_strncpy(p, q, size); } extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); /** * strnlen - Return bounded count of characters in a NUL-terminated string * * @p: pointer to NUL-terminated string to count. * @maxlen: maximum number of characters to count. * * Returns number of characters in @p (NOT including the final NUL), or * @maxlen, if no NUL has been found up to there. * */ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size_t maxlen) { const size_t p_size = __member_size(p); const size_t p_len = __compiletime_strlen(p); size_t ret; /* We can take compile-time actions when maxlen is const. */ if (__builtin_constant_p(maxlen) && p_len != SIZE_MAX) { /* If p is const, we can use its compile-time-known len. */ if (maxlen >= p_size) return p_len; } /* Do not check characters beyond the end of p. */ ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); if (p_size <= ret && maxlen != ret) fortify_panic(FORTIFY_FUNC_strnlen, FORTIFY_READ, p_size, ret + 1, ret); return ret; } /* * Defined after fortified strnlen to reuse it. However, it must still be * possible for strlen() to be used on compile-time strings for use in * static initializers (i.e. as a constant expression). */ /** * strlen - Return count of characters in a NUL-terminated string * * @p: pointer to NUL-terminated string to count. * * Do not use this function unless the string length is known at * compile-time. When @p is unterminated, this function may crash * or return unexpected counts that could lead to memory content * exposures. Prefer strnlen(). * * Returns number of characters in @p (NOT including the final NUL). * */ #define strlen(p) \ __builtin_choose_expr(__is_constexpr(__builtin_strlen(p)), \ __builtin_strlen(p), __fortify_strlen(p)) __FORTIFY_INLINE __diagnose_as(__builtin_strlen, 1) __kernel_size_t __fortify_strlen(const char * const POS p) { const size_t p_size = __member_size(p); __kernel_size_t ret; /* Give up if we don't know how large p is. */ if (p_size == SIZE_MAX) return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) fortify_panic(FORTIFY_FUNC_strlen, FORTIFY_READ, p_size, ret + 1, ret); return ret; } /* Defined after fortified strnlen() to reuse it. */ extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(sized_strscpy); __FORTIFY_INLINE ssize_t sized_strscpy(char * const POS p, const char * const POS q, size_t size) { /* Use string size rather than possible enclosing struct size. */ const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t len; /* If we cannot get size of p and q default to call strscpy. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strscpy(p, q, size); /* * If size can be known at compile time and is greater than * p_size, generate a compile time write overflow error. */ if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* Short-circuit for compile-time known-safe lengths. */ if (__compiletime_lessthan(p_size, SIZE_MAX)) { len = __compiletime_strlen(q); if (len < SIZE_MAX && __compiletime_lessthan(len, size)) { __underlying_memcpy(p, q, len + 1); return len; } } /* * This call protects from read overflow, because len will default to q * length if it smaller than size. */ len = strnlen(q, size); /* * If len equals size, we will copy only size bytes which leads to * -E2BIG being returned. * Otherwise we will copy len + 1 because of the final '\O'. */ len = len == size ? size : len + 1; /* * Generate a runtime write overflow error if len is greater than * p_size. */ if (p_size < len) fortify_panic(FORTIFY_FUNC_strscpy, FORTIFY_WRITE, p_size, len, -E2BIG); /* * We can now safely call vanilla strscpy because we are protected from: * 1. Read overflow thanks to call to strnlen(). * 2. Write overflow thanks to above ifs. */ return __real_strscpy(p, q, len); } /* Defined after fortified strlen() to reuse it. */ extern size_t __real_strlcat(char *p, const char *q, size_t avail) __RENAME(strlcat); /** * strlcat - Append a string to an existing string * * @p: pointer to %NUL-terminated string to append to * @q: pointer to %NUL-terminated string to append from * @avail: Maximum bytes available in @p * * Appends %NUL-terminated string @q after the %NUL-terminated * string at @p, but will not write beyond @avail bytes total, * potentially truncating the copy from @q. @p will stay * %NUL-terminated only if a %NUL already existed within * the @avail bytes of @p. If so, the resulting number of * bytes copied from @q will be at most "@avail - strlen(@p) - 1". * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the sizes * of @p and @q are known to the compiler. Prefer building the * string with formatting, via scnprintf(), seq_buf, or similar. * * Returns total bytes that _would_ have been contained by @p * regardless of truncation, similar to snprintf(). If return * value is >= @avail, the string has been truncated. * */ __FORTIFY_INLINE size_t strlcat(char * const POS p, const char * const POS q, size_t avail) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t p_len, copy_len; size_t actual, wanted; /* Give up immediately if both buffer sizes are unknown. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strlcat(p, q, avail); p_len = strnlen(p, avail); copy_len = strlen(q); wanted = actual = p_len + copy_len; /* Cannot append any more: report truncation. */ if (avail <= p_len) return wanted; /* Give up if string is already overflowed. */ if (p_size <= p_len) fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_READ, p_size, p_len + 1, wanted); if (actual >= avail) { copy_len = avail - p_len - 1; actual = p_len + copy_len; } /* Give up if copy will overflow. */ if (p_size <= actual) fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_WRITE, p_size, actual + 1, wanted); __underlying_memcpy(p + p_len, q, copy_len); p[actual] = '\0'; return wanted; } /* Defined after fortified strlcat() to reuse it. */ /** * strcat - Append a string to an existing string * * @p: pointer to NUL-terminated string to append to * @q: pointer to NUL-terminated source string to append from * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the * destination buffer size is known to the compiler. Prefer * building the string with formatting, via scnprintf() or similar. * At the very least, use strncat(). * * Returns @p. * */ __FORTIFY_INLINE __diagnose_as(__builtin_strcat, 1, 2) char *strcat(char * const POS p, const char *q) { const size_t p_size = __member_size(p); const size_t wanted = strlcat(p, q, p_size); if (p_size <= wanted) fortify_panic(FORTIFY_FUNC_strcat, FORTIFY_WRITE, p_size, wanted + 1, p); return p; } /** * strncat - Append a string to an existing string * * @p: pointer to NUL-terminated string to append to * @q: pointer to source string to append from * @count: Maximum bytes to read from @q * * Appends at most @count bytes from @q (stopping at the first * NUL byte) after the NUL-terminated string at @p. @p will be * NUL-terminated. * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the sizes * of @p and @q are known to the compiler. Prefer building the * string with formatting, via scnprintf() or similar. * * Returns @p. * */ /* Defined after fortified strlen() and strnlen() to reuse them. */ __FORTIFY_INLINE __diagnose_as(__builtin_strncat, 1, 2, 3) char *strncat(char * const POS p, const char * const POS q, __kernel_size_t count) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t p_len, copy_len, total; if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strncat(p, q, count); p_len = strlen(p); copy_len = strnlen(q, count); total = p_len + copy_len + 1; if (p_size < total) fortify_panic(FORTIFY_FUNC_strncat, FORTIFY_WRITE, p_size, total, p); __underlying_memcpy(p + p_len, q, copy_len); p[p_len + copy_len] = '\0'; return p; } __FORTIFY_INLINE bool fortify_memset_chk(__kernel_size_t size, const size_t p_size, const size_t p_size_field) { if (__builtin_constant_p(size)) { /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ if (__compiletime_lessthan(p_size_field, p_size) && __compiletime_lessthan(p_size, size)) __write_overflow(); /* Warn when write size is larger than dest field. */ if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); } /* * At this point, length argument may not be a constant expression, * so run-time bounds checking can be done where buffer sizes are * known. (This is not an "else" because the above checks may only * be compile-time warnings, and we want to still warn for run-time * overflows.) */ /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) */ if (p_size != SIZE_MAX && p_size < size) fortify_panic(FORTIFY_FUNC_memset, FORTIFY_WRITE, p_size, size, true); return false; } #define __fortify_memset_chk(p, c, size, p_size, p_size_field) ({ \ size_t __fortify_size = (size_t)(size); \ fortify_memset_chk(__fortify_size, p_size, p_size_field), \ __underlying_memset(p, c, __fortify_size); \ }) /* * __struct_size() vs __member_size() must be captured here to avoid * evaluating argument side-effects further into the macro layers. */ #ifndef CONFIG_KMSAN #define memset(p, c, s) __fortify_memset_chk(p, c, s, \ __struct_size(p), __member_size(p)) #endif /* * To make sure the compiler can enforce protection against buffer overflows, * memcpy(), memmove(), and memset() must not be used beyond individual * struct members. If you need to copy across multiple members, please use * struct_group() to create a named mirror of an anonymous struct union. * (e.g. see struct sk_buff.) Read overflow checking is currently only * done when a write overflow is also present, or when building with W=1. * * Mitigation coverage matrix * Bounds checking at: * +-------+-------+-------+-------+ * | Compile time | Run time | * memcpy() argument sizes: | write | read | write | read | * dest source length +-------+-------+-------+-------+ * memcpy(known, known, constant) | y | y | n/a | n/a | * memcpy(known, unknown, constant) | y | n | n/a | V | * memcpy(known, known, dynamic) | n | n | B | B | * memcpy(known, unknown, dynamic) | n | n | B | V | * memcpy(unknown, known, constant) | n | y | V | n/a | * memcpy(unknown, unknown, constant) | n | n | V | V | * memcpy(unknown, known, dynamic) | n | n | V | B | * memcpy(unknown, unknown, dynamic) | n | n | V | V | * +-------+-------+-------+-------+ * * y = perform deterministic compile-time bounds checking * n = cannot perform deterministic compile-time bounds checking * n/a = no run-time bounds checking needed since compile-time deterministic * B = can perform run-time bounds checking (currently unimplemented) * V = vulnerable to run-time overflow (will need refactoring to solve) * */ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t p_size, const size_t q_size, const size_t p_size_field, const size_t q_size_field, const u8 func) { if (__builtin_constant_p(size)) { /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ if (__compiletime_lessthan(p_size_field, p_size) && __compiletime_lessthan(p_size, size)) __write_overflow(); if (__compiletime_lessthan(q_size_field, q_size) && __compiletime_lessthan(q_size, size)) __read_overflow2(); /* Warn when write size argument larger than dest field. */ if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); /* * Warn for source field over-read when building with W=1 * or when an over-write happened, so both can be fixed at * the same time. */ if ((IS_ENABLED(KBUILD_EXTRA_WARN1) || __compiletime_lessthan(p_size_field, size)) && __compiletime_lessthan(q_size_field, size)) __read_overflow2_field(q_size_field, size); } /* * At this point, length argument may not be a constant expression, * so run-time bounds checking can be done where buffer sizes are * known. (This is not an "else" because the above checks may only * be compile-time warnings, and we want to still warn for run-time * overflows.) */ /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) */ if (p_size != SIZE_MAX && p_size < size) fortify_panic(func, FORTIFY_WRITE, p_size, size, true); else if (q_size != SIZE_MAX && q_size < size) fortify_panic(func, FORTIFY_READ, q_size, size, true); /* * Warn when writing beyond destination field size. * * Note the implementation of __builtin_*object_size() behaves * like sizeof() when not directly referencing a flexible * array member, which means there will be many bounds checks * that will appear at run-time, without a way for them to be * detected at compile-time (as can be done when the destination * is specifically the flexible array member). * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832 */ if (p_size_field != SIZE_MAX && p_size != p_size_field && p_size_field < size) return true; return false; } /* * To work around what seems to be an optimizer bug, the macro arguments * need to have const copies or the values end up changed by the time they * reach fortify_warn_once(). See commit 6f7630b1b5bc ("fortify: Capture * __bos() results in const temp vars") for more details. */ #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ const size_t __fortify_size = (size_t)(size); \ const size_t __p_size = (p_size); \ const size_t __q_size = (q_size); \ const size_t __p_size_field = (p_size_field); \ const size_t __q_size_field = (q_size_field); \ /* Keep a mutable version of the size for the final copy. */ \ size_t __copy_size = __fortify_size; \ fortify_warn_once(fortify_memcpy_chk(__fortify_size, __p_size, \ __q_size, __p_size_field, \ __q_size_field, FORTIFY_FUNC_ ##op), \ #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \ __fortify_size, \ "field \"" #p "\" at " FILE_LINE, \ __p_size_field); \ /* Hide only the run-time size from value range tracking to */ \ /* silence compile-time false positive bounds warnings. */ \ if (!__builtin_constant_p(__copy_size)) \ OPTIMIZER_HIDE_VAR(__copy_size); \ __underlying_##op(p, q, __copy_size); \ }) /* * Notes about compile-time buffer size detection: * * With these types... * * struct middle { * u16 a; * u8 middle_buf[16]; * int b; * }; * struct end { * u16 a; * u8 end_buf[16]; * }; * struct flex { * int a; * u8 flex_buf[]; * }; * * void func(TYPE *ptr) { ... } * * Cases where destination size cannot be currently detected: * - the size of ptr's object (seemingly by design, gcc & clang fail): * __builtin_object_size(ptr, 1) == SIZE_MAX * - the size of flexible arrays in ptr's obj (by design, dynamic size): * __builtin_object_size(ptr->flex_buf, 1) == SIZE_MAX * - the size of ANY array at the end of ptr's obj (gcc and clang bug): * __builtin_object_size(ptr->end_buf, 1) == SIZE_MAX * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836 * * Cases where destination size is currently detected: * - the size of non-array members within ptr's object: * __builtin_object_size(ptr->a, 1) == 2 * - the size of non-flexible-array in the middle of ptr's obj: * __builtin_object_size(ptr->middle_buf, 1) == 16 * */ /* * __struct_size() vs __member_size() must be captured here to avoid * evaluating argument side-effects further into the macro layers. */ #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ __struct_size(p), __struct_size(q), \ __member_size(p), __member_size(q), \ memcpy) #define memmove(p, q, s) __fortify_memcpy_chk(p, q, s, \ __struct_size(p), __struct_size(q), \ __member_size(p), __member_size(q), \ memmove) extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_memscan, FORTIFY_READ, p_size, size, NULL); return __real_memscan(p, c, size); } __FORTIFY_INLINE __diagnose_as(__builtin_memcmp, 1, 2, 3) int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t size) { const size_t p_size = __struct_size(p); const size_t q_size = __struct_size(q); if (__builtin_constant_p(size)) { if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (__compiletime_lessthan(q_size, size)) __read_overflow2(); } if (p_size < size) fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, p_size, size, INT_MIN); else if (q_size < size) fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, q_size, size, INT_MIN); return __underlying_memcmp(p, q, size); } __FORTIFY_INLINE __diagnose_as(__builtin_memchr, 1, 2, 3) void *memchr(const void * const POS0 p, int c, __kernel_size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_memchr, FORTIFY_READ, p_size, size, NULL); return __underlying_memchr(p, c, size); } void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_memchr_inv, FORTIFY_READ, p_size, size, NULL); return __real_memchr_inv(p, c, size); } extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup_noprof) __realloc_size(2); __FORTIFY_INLINE void *kmemdup_noprof(const void * const POS0 p, size_t size, gfp_t gfp) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ, p_size, size, __real_kmemdup(p, 0, gfp)); return __real_kmemdup(p, size, gfp); } #define kmemdup(...) alloc_hooks(kmemdup_noprof(__VA_ARGS__)) /** * strcpy - Copy a string into another string buffer * * @p: pointer to destination of copy * @q: pointer to NUL-terminated source string to copy * * Do not use this function. While FORTIFY_SOURCE tries to avoid * overflows, this is only possible when the sizes of @q and @p are * known to the compiler. Prefer strscpy(), though note its different * return values for detecting truncation. * * Returns @p. * */ /* Defined after fortified strlen to reuse it. */ __FORTIFY_INLINE __diagnose_as(__builtin_strcpy, 1, 2) char *strcpy(char * const POS p, const char * const POS q) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t size; /* If neither buffer size is known, immediately give up. */ if (__builtin_constant_p(p_size) && __builtin_constant_p(q_size) && p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strcpy(p, q); size = strlen(q) + 1; /* Compile-time check for const size overflow. */ if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* Run-time check for dynamic size overflow. */ if (p_size < size) fortify_panic(FORTIFY_FUNC_strcpy, FORTIFY_WRITE, p_size, size, p); __underlying_memcpy(p, q, size); return p; } /* Don't use these outside the FORITFY_SOURCE implementation */ #undef __underlying_memchr #undef __underlying_memcmp #undef __underlying_strcat #undef __underlying_strcpy #undef __underlying_strlen #undef __underlying_strncat #undef __underlying_strncpy #undef POS #undef POS0 #endif /* _LINUX_FORTIFY_STRING_H_ */
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 /* SPDX-License-Identifier: GPL-2.0 */ /* * QNX6 file system, Linux implementation. * * Version : 1.0.0 * * History : * * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release. * 16-02-2012 page map extension by Al Viro * */ #ifdef pr_fmt #undef pr_fmt #endif #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/fs.h> #include <linux/pagemap.h> typedef __u16 __bitwise __fs16; typedef __u32 __bitwise __fs32; typedef __u64 __bitwise __fs64; #include <linux/qnx6_fs.h> struct qnx6_sb_info { struct buffer_head *sb_buf; /* superblock buffer */ struct qnx6_super_block *sb; /* our superblock */ int s_blks_off; /* blkoffset fs-startpoint */ int s_ptrbits; /* indirect pointer bitfield */ unsigned long s_mount_opt; /* all mount options */ int s_bytesex; /* holds endianess info */ struct inode * inodes; struct inode * longfile; }; struct qnx6_inode_info { __fs32 di_block_ptr[QNX6_NO_DIRECT_POINTERS]; __u8 di_filelevels; __u32 i_dir_start_lookup; struct inode vfs_inode; }; extern struct inode *qnx6_iget(struct super_block *sb, unsigned ino); extern struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); #ifdef CONFIG_QNX6FS_DEBUG extern void qnx6_superblock_debug(struct qnx6_super_block *, struct super_block *); #endif extern const struct inode_operations qnx6_dir_inode_operations; extern const struct file_operations qnx6_dir_operations; static inline struct qnx6_sb_info *QNX6_SB(struct super_block *sb) { return sb->s_fs_info; } static inline struct qnx6_inode_info *QNX6_I(struct inode *inode) { return container_of(inode, struct qnx6_inode_info, vfs_inode); } #define clear_opt(o, opt) (o &= ~(QNX6_MOUNT_##opt)) #define set_opt(o, opt) (o |= (QNX6_MOUNT_##opt)) #define test_opt(sb, opt) (QNX6_SB(sb)->s_mount_opt & \ QNX6_MOUNT_##opt) enum { BYTESEX_LE, BYTESEX_BE, }; static inline __u64 fs64_to_cpu(struct qnx6_sb_info *sbi, __fs64 n) { if (sbi->s_bytesex == BYTESEX_LE) return le64_to_cpu((__force __le64)n); else return be64_to_cpu((__force __be64)n); } static inline __fs64 cpu_to_fs64(struct qnx6_sb_info *sbi, __u64 n) { if (sbi->s_bytesex == BYTESEX_LE) return (__force __fs64)cpu_to_le64(n); else return (__force __fs64)cpu_to_be64(n); } static inline __u32 fs32_to_cpu(struct qnx6_sb_info *sbi, __fs32 n) { if (sbi->s_bytesex == BYTESEX_LE) return le32_to_cpu((__force __le32)n); else return be32_to_cpu((__force __be32)n); } static inline __fs32 cpu_to_fs32(struct qnx6_sb_info *sbi, __u32 n) { if (sbi->s_bytesex == BYTESEX_LE) return (__force __fs32)cpu_to_le32(n); else return (__force __fs32)cpu_to_be32(n); } static inline __u16 fs16_to_cpu(struct qnx6_sb_info *sbi, __fs16 n) { if (sbi->s_bytesex == BYTESEX_LE) return le16_to_cpu((__force __le16)n); else return be16_to_cpu((__force __be16)n); } static inline __fs16 cpu_to_fs16(struct qnx6_sb_info *sbi, __u16 n) { if (sbi->s_bytesex == BYTESEX_LE) return (__force __fs16)cpu_to_le16(n); else return (__force __fs16)cpu_to_be16(n); } extern struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent); unsigned qnx6_find_ino(int len, struct inode *dir, const char *name);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 /* SPDX-License-Identifier: GPL-2.0 */ /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * * on-disk ntfs structs */ // clang-format off #ifndef _LINUX_NTFS3_NTFS_H #define _LINUX_NTFS3_NTFS_H #include <linux/blkdev.h> #include <linux/build_bug.h> #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include "debug.h" /* TODO: Check 4K MFT record and 512 bytes cluster. */ /* Check each run for marked clusters. */ #define NTFS3_CHECK_FREE_CLST #define NTFS_NAME_LEN 255 /* * ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff. * xfstest generic/041 creates 3003 hardlinks. */ #define NTFS_LINK_MAX 4000 /* * Activate to use 64 bit clusters instead of 32 bits in ntfs.sys. * Logical and virtual cluster number if needed, may be * redefined to use 64 bit value. */ //#define CONFIG_NTFS3_64BIT_CLUSTER #define NTFS_LZNT_MAX_CLUSTER 4096 #define NTFS_LZNT_CUNIT 4 #define NTFS_LZNT_CLUSTERS (1u<<NTFS_LZNT_CUNIT) struct GUID { __le32 Data1; __le16 Data2; __le16 Data3; u8 Data4[8]; }; /* * This struct repeats layout of ATTR_FILE_NAME * at offset 0x40. * It used to store global constants NAME_MFT/NAME_MIRROR... * most constant names are shorter than 10. */ struct cpu_str { u8 len; u8 unused; u16 name[]; }; struct le_str { u8 len; u8 unused; __le16 name[]; }; static_assert(SECTOR_SHIFT == 9); #ifdef CONFIG_NTFS3_64BIT_CLUSTER typedef u64 CLST; static_assert(sizeof(size_t) == 8); #else typedef u32 CLST; #endif #define SPARSE_LCN64 ((u64)-1) #define SPARSE_LCN ((CLST)-1) #define RESIDENT_LCN ((CLST)-2) #define COMPRESSED_LCN ((CLST)-3) enum RECORD_NUM { MFT_REC_MFT = 0, MFT_REC_MIRR = 1, MFT_REC_LOG = 2, MFT_REC_VOL = 3, MFT_REC_ATTR = 4, MFT_REC_ROOT = 5, MFT_REC_BITMAP = 6, MFT_REC_BOOT = 7, MFT_REC_BADCLUST = 8, MFT_REC_SECURE = 9, MFT_REC_UPCASE = 10, MFT_REC_EXTEND = 11, MFT_REC_RESERVED = 12, MFT_REC_FREE = 16, MFT_REC_USER = 24, }; enum ATTR_TYPE { ATTR_ZERO = cpu_to_le32(0x00), ATTR_STD = cpu_to_le32(0x10), ATTR_LIST = cpu_to_le32(0x20), ATTR_NAME = cpu_to_le32(0x30), ATTR_ID = cpu_to_le32(0x40), ATTR_SECURE = cpu_to_le32(0x50), ATTR_LABEL = cpu_to_le32(0x60), ATTR_VOL_INFO = cpu_to_le32(0x70), ATTR_DATA = cpu_to_le32(0x80), ATTR_ROOT = cpu_to_le32(0x90), ATTR_ALLOC = cpu_to_le32(0xA0), ATTR_BITMAP = cpu_to_le32(0xB0), ATTR_REPARSE = cpu_to_le32(0xC0), ATTR_EA_INFO = cpu_to_le32(0xD0), ATTR_EA = cpu_to_le32(0xE0), ATTR_PROPERTYSET = cpu_to_le32(0xF0), ATTR_LOGGED_UTILITY_STREAM = cpu_to_le32(0x100), ATTR_END = cpu_to_le32(0xFFFFFFFF) }; static_assert(sizeof(enum ATTR_TYPE) == 4); enum FILE_ATTRIBUTE { FILE_ATTRIBUTE_READONLY = cpu_to_le32(0x00000001), FILE_ATTRIBUTE_HIDDEN = cpu_to_le32(0x00000002), FILE_ATTRIBUTE_SYSTEM = cpu_to_le32(0x00000004), FILE_ATTRIBUTE_ARCHIVE = cpu_to_le32(0x00000020), FILE_ATTRIBUTE_DEVICE = cpu_to_le32(0x00000040), FILE_ATTRIBUTE_TEMPORARY = cpu_to_le32(0x00000100), FILE_ATTRIBUTE_SPARSE_FILE = cpu_to_le32(0x00000200), FILE_ATTRIBUTE_REPARSE_POINT = cpu_to_le32(0x00000400), FILE_ATTRIBUTE_COMPRESSED = cpu_to_le32(0x00000800), FILE_ATTRIBUTE_OFFLINE = cpu_to_le32(0x00001000), FILE_ATTRIBUTE_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000), FILE_ATTRIBUTE_ENCRYPTED = cpu_to_le32(0x00004000), FILE_ATTRIBUTE_VALID_FLAGS = cpu_to_le32(0x00007fb7), FILE_ATTRIBUTE_DIRECTORY = cpu_to_le32(0x10000000), FILE_ATTRIBUTE_INDEX = cpu_to_le32(0x20000000) }; static_assert(sizeof(enum FILE_ATTRIBUTE) == 4); extern const struct cpu_str NAME_MFT; extern const struct cpu_str NAME_MIRROR; extern const struct cpu_str NAME_LOGFILE; extern const struct cpu_str NAME_VOLUME; extern const struct cpu_str NAME_ATTRDEF; extern const struct cpu_str NAME_ROOT; extern const struct cpu_str NAME_BITMAP; extern const struct cpu_str NAME_BOOT; extern const struct cpu_str NAME_BADCLUS; extern const struct cpu_str NAME_QUOTA; extern const struct cpu_str NAME_SECURE; extern const struct cpu_str NAME_UPCASE; extern const struct cpu_str NAME_EXTEND; extern const struct cpu_str NAME_OBJID; extern const struct cpu_str NAME_REPARSE; extern const struct cpu_str NAME_USNJRNL; extern const __le16 I30_NAME[4]; extern const __le16 SII_NAME[4]; extern const __le16 SDH_NAME[4]; extern const __le16 SO_NAME[2]; extern const __le16 SQ_NAME[2]; extern const __le16 SR_NAME[2]; extern const __le16 BAD_NAME[4]; extern const __le16 SDS_NAME[4]; extern const __le16 WOF_NAME[17]; /* WofCompressedData */ /* MFT record number structure. */ struct MFT_REF { __le32 low; // The low part of the number. __le16 high; // The high part of the number. __le16 seq; // The sequence number of MFT record. }; static_assert(sizeof(__le64) == sizeof(struct MFT_REF)); static inline CLST ino_get(const struct MFT_REF *ref) { #ifdef CONFIG_NTFS3_64BIT_CLUSTER return le32_to_cpu(ref->low) | ((u64)le16_to_cpu(ref->high) << 32); #else return le32_to_cpu(ref->low); #endif } struct NTFS_BOOT { u8 jump_code[3]; // 0x00: Jump to boot code. u8 system_id[8]; // 0x03: System ID, equals "NTFS " // NOTE: This member is not aligned(!) // bytes_per_sector[0] must be 0. // bytes_per_sector[1] must be multiplied by 256. u8 bytes_per_sector[2]; // 0x0B: Bytes per sector. u8 sectors_per_clusters;// 0x0D: Sectors per cluster. u8 unused1[7]; u8 media_type; // 0x15: Media type (0xF8 - harddisk) u8 unused2[2]; __le16 sct_per_track; // 0x18: number of sectors per track. __le16 heads; // 0x1A: number of heads per cylinder. __le32 hidden_sectors; // 0x1C: number of 'hidden' sectors. u8 unused3[4]; u8 bios_drive_num; // 0x24: BIOS drive number =0x80. u8 unused4; u8 signature_ex; // 0x26: Extended BOOT signature =0x80. u8 unused5; __le64 sectors_per_volume;// 0x28: Size of volume in sectors. __le64 mft_clst; // 0x30: First cluster of $MFT __le64 mft2_clst; // 0x38: First cluster of $MFTMirr s8 record_size; // 0x40: Size of MFT record in clusters(sectors). u8 unused6[3]; s8 index_size; // 0x44: Size of INDX record in clusters(sectors). u8 unused7[3]; __le64 serial_num; // 0x48: Volume serial number __le32 check_sum; // 0x50: Simple additive checksum of all // of the u32's which precede the 'check_sum'. u8 boot_code[0x200 - 0x50 - 2 - 4]; // 0x54: u8 boot_magic[2]; // 0x1FE: Boot signature =0x55 + 0xAA }; static_assert(sizeof(struct NTFS_BOOT) == 0x200); enum NTFS_SIGNATURE { NTFS_FILE_SIGNATURE = cpu_to_le32(0x454C4946), // 'FILE' NTFS_INDX_SIGNATURE = cpu_to_le32(0x58444E49), // 'INDX' NTFS_CHKD_SIGNATURE = cpu_to_le32(0x444B4843), // 'CHKD' NTFS_RSTR_SIGNATURE = cpu_to_le32(0x52545352), // 'RSTR' NTFS_RCRD_SIGNATURE = cpu_to_le32(0x44524352), // 'RCRD' NTFS_BAAD_SIGNATURE = cpu_to_le32(0x44414142), // 'BAAD' NTFS_HOLE_SIGNATURE = cpu_to_le32(0x454C4F48), // 'HOLE' NTFS_FFFF_SIGNATURE = cpu_to_le32(0xffffffff), }; static_assert(sizeof(enum NTFS_SIGNATURE) == 4); /* MFT Record header structure. */ struct NTFS_RECORD_HEADER { /* Record magic number, equals 'FILE'/'INDX'/'RSTR'/'RCRD'. */ enum NTFS_SIGNATURE sign; // 0x00: __le16 fix_off; // 0x04: __le16 fix_num; // 0x06: __le64 lsn; // 0x08: Log file sequence number, }; static_assert(sizeof(struct NTFS_RECORD_HEADER) == 0x10); static inline int is_baad(const struct NTFS_RECORD_HEADER *hdr) { return hdr->sign == NTFS_BAAD_SIGNATURE; } /* Possible bits in struct MFT_REC.flags. */ enum RECORD_FLAG { RECORD_FLAG_IN_USE = cpu_to_le16(0x0001), RECORD_FLAG_DIR = cpu_to_le16(0x0002), RECORD_FLAG_SYSTEM = cpu_to_le16(0x0004), RECORD_FLAG_INDEX = cpu_to_le16(0x0008), }; /* MFT Record structure. */ struct MFT_REC { struct NTFS_RECORD_HEADER rhdr; // 'FILE' __le16 seq; // 0x10: Sequence number for this record. __le16 hard_links; // 0x12: The number of hard links to record. __le16 attr_off; // 0x14: Offset to attributes. __le16 flags; // 0x16: See RECORD_FLAG. __le32 used; // 0x18: The size of used part. __le32 total; // 0x1C: Total record size. struct MFT_REF parent_ref; // 0x20: Parent MFT record. __le16 next_attr_id; // 0x28: The next attribute Id. __le16 res; // 0x2A: High part of MFT record? __le32 mft_record; // 0x2C: Current MFT record number. __le16 fixups[]; // 0x30: }; #define MFTRECORD_FIXUP_OFFSET_1 offsetof(struct MFT_REC, res) #define MFTRECORD_FIXUP_OFFSET_3 offsetof(struct MFT_REC, fixups) /* * define MFTRECORD_FIXUP_OFFSET as MFTRECORD_FIXUP_OFFSET_3 (0x30) * to format new mft records with bigger header (as current ntfs.sys does) * * define MFTRECORD_FIXUP_OFFSET as MFTRECORD_FIXUP_OFFSET_1 (0x2A) * to format new mft records with smaller header (as old ntfs.sys did) * Both variants are valid. */ #define MFTRECORD_FIXUP_OFFSET MFTRECORD_FIXUP_OFFSET_1 static_assert(MFTRECORD_FIXUP_OFFSET_1 == 0x2A); static_assert(MFTRECORD_FIXUP_OFFSET_3 == 0x30); static inline bool is_rec_base(const struct MFT_REC *rec) { const struct MFT_REF *r = &rec->parent_ref; return !r->low && !r->high && !r->seq; } static inline bool is_mft_rec5(const struct MFT_REC *rec) { return le16_to_cpu(rec->rhdr.fix_off) >= offsetof(struct MFT_REC, fixups); } static inline bool is_rec_inuse(const struct MFT_REC *rec) { return rec->flags & RECORD_FLAG_IN_USE; } static inline bool clear_rec_inuse(struct MFT_REC *rec) { return rec->flags &= ~RECORD_FLAG_IN_USE; } /* Possible values of ATTR_RESIDENT.flags */ #define RESIDENT_FLAG_INDEXED 0x01 struct ATTR_RESIDENT { __le32 data_size; // 0x10: The size of data. __le16 data_off; // 0x14: Offset to data. u8 flags; // 0x16: Resident flags ( 1 - indexed ). u8 res; // 0x17: }; // sizeof() = 0x18 struct ATTR_NONRESIDENT { __le64 svcn; // 0x10: Starting VCN of this segment. __le64 evcn; // 0x18: End VCN of this segment. __le16 run_off; // 0x20: Offset to packed runs. // Unit of Compression size for this stream, expressed // as a log of the cluster size. // // 0 means file is not compressed // 1, 2, 3, and 4 are potentially legal values if the // stream is compressed, however the implementation // may only choose to use 4, or possibly 3. // Note that 4 means cluster size time 16. // If convenient the implementation may wish to accept a // reasonable range of legal values here (1-5?), // even if the implementation only generates // a smaller set of values itself. u8 c_unit; // 0x22: u8 res1[5]; // 0x23: __le64 alloc_size; // 0x28: The allocated size of attribute in bytes. // (multiple of cluster size) __le64 data_size; // 0x30: The size of attribute in bytes <= alloc_size. __le64 valid_size; // 0x38: The size of valid part in bytes <= data_size. __le64 total_size; // 0x40: The sum of the allocated clusters for a file. // (present only for the first segment (0 == vcn) // of compressed attribute) }; // sizeof()=0x40 or 0x48 (if compressed) /* Possible values of ATTRIB.flags: */ #define ATTR_FLAG_COMPRESSED cpu_to_le16(0x0001) #define ATTR_FLAG_COMPRESSED_MASK cpu_to_le16(0x00FF) #define ATTR_FLAG_ENCRYPTED cpu_to_le16(0x4000) #define ATTR_FLAG_SPARSED cpu_to_le16(0x8000) struct ATTRIB { enum ATTR_TYPE type; // 0x00: The type of this attribute. __le32 size; // 0x04: The size of this attribute. u8 non_res; // 0x08: Is this attribute non-resident? u8 name_len; // 0x09: This attribute name length. __le16 name_off; // 0x0A: Offset to the attribute name. __le16 flags; // 0x0C: See ATTR_FLAG_XXX. __le16 id; // 0x0E: Unique id (per record). union { struct ATTR_RESIDENT res; // 0x10 struct ATTR_NONRESIDENT nres; // 0x10 }; }; /* Define attribute sizes. */ #define SIZEOF_RESIDENT 0x18 #define SIZEOF_NONRESIDENT_EX 0x48 #define SIZEOF_NONRESIDENT 0x40 #define SIZEOF_RESIDENT_LE cpu_to_le16(0x18) #define SIZEOF_NONRESIDENT_EX_LE cpu_to_le16(0x48) #define SIZEOF_NONRESIDENT_LE cpu_to_le16(0x40) static inline u64 attr_ondisk_size(const struct ATTRIB *attr) { return attr->non_res ? ((attr->flags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) ? le64_to_cpu(attr->nres.total_size) : le64_to_cpu(attr->nres.alloc_size)) : ALIGN(le32_to_cpu(attr->res.data_size), 8); } static inline u64 attr_size(const struct ATTRIB *attr) { return attr->non_res ? le64_to_cpu(attr->nres.data_size) : le32_to_cpu(attr->res.data_size); } static inline bool is_attr_encrypted(const struct ATTRIB *attr) { return attr->flags & ATTR_FLAG_ENCRYPTED; } static inline bool is_attr_sparsed(const struct ATTRIB *attr) { return attr->flags & ATTR_FLAG_SPARSED; } static inline bool is_attr_compressed(const struct ATTRIB *attr) { return attr->flags & ATTR_FLAG_COMPRESSED; } static inline bool is_attr_ext(const struct ATTRIB *attr) { return attr->flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED); } static inline bool is_attr_indexed(const struct ATTRIB *attr) { return !attr->non_res && (attr->res.flags & RESIDENT_FLAG_INDEXED); } static inline __le16 const *attr_name(const struct ATTRIB *attr) { return Add2Ptr(attr, le16_to_cpu(attr->name_off)); } static inline u64 attr_svcn(const struct ATTRIB *attr) { return attr->non_res ? le64_to_cpu(attr->nres.svcn) : 0; } static_assert(sizeof(struct ATTRIB) == 0x48); static_assert(sizeof(((struct ATTRIB *)NULL)->res) == 0x08); static_assert(sizeof(((struct ATTRIB *)NULL)->nres) == 0x38); static inline void *resident_data_ex(const struct ATTRIB *attr, u32 datasize) { u32 asize, rsize; u16 off; if (attr->non_res) return NULL; asize = le32_to_cpu(attr->size); off = le16_to_cpu(attr->res.data_off); if (asize < datasize + off) return NULL; rsize = le32_to_cpu(attr->res.data_size); if (rsize < datasize) return NULL; return Add2Ptr(attr, off); } static inline void *resident_data(const struct ATTRIB *attr) { return Add2Ptr(attr, le16_to_cpu(attr->res.data_off)); } static inline void *attr_run(const struct ATTRIB *attr) { return Add2Ptr(attr, le16_to_cpu(attr->nres.run_off)); } /* Standard information attribute (0x10). */ struct ATTR_STD_INFO { __le64 cr_time; // 0x00: File creation file. __le64 m_time; // 0x08: File modification time. __le64 c_time; // 0x10: Last time any attribute was modified. __le64 a_time; // 0x18: File last access time. enum FILE_ATTRIBUTE fa; // 0x20: Standard DOS attributes & more. __le32 max_ver_num; // 0x24: Maximum Number of Versions. __le32 ver_num; // 0x28: Version Number. __le32 class_id; // 0x2C: Class Id from bidirectional Class Id index. }; static_assert(sizeof(struct ATTR_STD_INFO) == 0x30); #define SECURITY_ID_INVALID 0x00000000 #define SECURITY_ID_FIRST 0x00000100 struct ATTR_STD_INFO5 { __le64 cr_time; // 0x00: File creation file. __le64 m_time; // 0x08: File modification time. __le64 c_time; // 0x10: Last time any attribute was modified. __le64 a_time; // 0x18: File last access time. enum FILE_ATTRIBUTE fa; // 0x20: Standard DOS attributes & more. __le32 max_ver_num; // 0x24: Maximum Number of Versions. __le32 ver_num; // 0x28: Version Number. __le32 class_id; // 0x2C: Class Id from bidirectional Class Id index. __le32 owner_id; // 0x30: Owner Id of the user owning the file. __le32 security_id; // 0x34: The Security Id is a key in the $SII Index and $SDS. __le64 quota_charge; // 0x38: __le64 usn; // 0x40: Last Update Sequence Number of the file. This is a direct // index into the file $UsnJrnl. If zero, the USN Journal is // disabled. }; static_assert(sizeof(struct ATTR_STD_INFO5) == 0x48); /* Attribute list entry structure (0x20) */ struct ATTR_LIST_ENTRY { enum ATTR_TYPE type; // 0x00: The type of attribute. __le16 size; // 0x04: The size of this record. u8 name_len; // 0x06: The length of attribute name. u8 name_off; // 0x07: The offset to attribute name. __le64 vcn; // 0x08: Starting VCN of this attribute. struct MFT_REF ref; // 0x10: MFT record number with attribute. __le16 id; // 0x18: struct ATTRIB ID. __le16 name[]; // 0x1A: To get real name use name_off. }; // sizeof(0x20) static inline u32 le_size(u8 name_len) { return ALIGN(offsetof(struct ATTR_LIST_ENTRY, name) + name_len * sizeof(short), 8); } /* Returns 0 if 'attr' has the same type and name. */ static inline int le_cmp(const struct ATTR_LIST_ENTRY *le, const struct ATTRIB *attr) { return le->type != attr->type || le->name_len != attr->name_len || (!le->name_len && memcmp(Add2Ptr(le, le->name_off), Add2Ptr(attr, le16_to_cpu(attr->name_off)), le->name_len * sizeof(short))); } static inline __le16 const *le_name(const struct ATTR_LIST_ENTRY *le) { return Add2Ptr(le, le->name_off); } /* File name types (the field type in struct ATTR_FILE_NAME). */ #define FILE_NAME_POSIX 0 #define FILE_NAME_UNICODE 1 #define FILE_NAME_DOS 2 #define FILE_NAME_UNICODE_AND_DOS (FILE_NAME_DOS | FILE_NAME_UNICODE) /* Filename attribute structure (0x30). */ struct NTFS_DUP_INFO { __le64 cr_time; // 0x00: File creation file. __le64 m_time; // 0x08: File modification time. __le64 c_time; // 0x10: Last time any attribute was modified. __le64 a_time; // 0x18: File last access time. __le64 alloc_size; // 0x20: Data attribute allocated size, multiple of cluster size. __le64 data_size; // 0x28: Data attribute size <= Dataalloc_size. enum FILE_ATTRIBUTE fa; // 0x30: Standard DOS attributes & more. __le32 extend_data; // 0x34: Extended data. }; // 0x38 struct ATTR_FILE_NAME { struct MFT_REF home; // 0x00: MFT record for directory. struct NTFS_DUP_INFO dup;// 0x08: u8 name_len; // 0x40: File name length in words. u8 type; // 0x41: File name type. __le16 name[]; // 0x42: File name. }; static_assert(sizeof(((struct ATTR_FILE_NAME *)NULL)->dup) == 0x38); static_assert(offsetof(struct ATTR_FILE_NAME, name) == 0x42); #define SIZEOF_ATTRIBUTE_FILENAME 0x44 #define SIZEOF_ATTRIBUTE_FILENAME_MAX (0x42 + 255 * 2) static inline struct ATTRIB *attr_from_name(struct ATTR_FILE_NAME *fname) { return (struct ATTRIB *)((char *)fname - SIZEOF_RESIDENT); } static inline u16 fname_full_size(const struct ATTR_FILE_NAME *fname) { /* Don't return struct_size(fname, name, fname->name_len); */ return offsetof(struct ATTR_FILE_NAME, name) + fname->name_len * sizeof(short); } static inline u8 paired_name(u8 type) { if (type == FILE_NAME_UNICODE) return FILE_NAME_DOS; if (type == FILE_NAME_DOS) return FILE_NAME_UNICODE; return FILE_NAME_POSIX; } /* Index entry defines ( the field flags in NtfsDirEntry ). */ #define NTFS_IE_HAS_SUBNODES cpu_to_le16(1) #define NTFS_IE_LAST cpu_to_le16(2) /* Directory entry structure. */ struct NTFS_DE { union { struct MFT_REF ref; // 0x00: MFT record number with this file. struct { __le16 data_off; // 0x00: __le16 data_size; // 0x02: __le32 res; // 0x04: Must be 0. } view; }; __le16 size; // 0x08: The size of this entry. __le16 key_size; // 0x0A: The size of File name length in bytes + 0x42. __le16 flags; // 0x0C: Entry flags: NTFS_IE_XXX. __le16 res; // 0x0E: // Here any indexed attribute can be placed. // One of them is: // struct ATTR_FILE_NAME AttrFileName; // // The last 8 bytes of this structure contains // the VBN of subnode. // !!! Note !!! // This field is presented only if (flags & NTFS_IE_HAS_SUBNODES) // __le64 vbn; }; static_assert(sizeof(struct NTFS_DE) == 0x10); static inline void de_set_vbn_le(struct NTFS_DE *e, __le64 vcn) { __le64 *v = Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64)); *v = vcn; } static inline void de_set_vbn(struct NTFS_DE *e, CLST vcn) { __le64 *v = Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64)); *v = cpu_to_le64(vcn); } static inline __le64 de_get_vbn_le(const struct NTFS_DE *e) { return *(__le64 *)Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64)); } static inline CLST de_get_vbn(const struct NTFS_DE *e) { __le64 *v = Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64)); return le64_to_cpu(*v); } static inline struct NTFS_DE *de_get_next(const struct NTFS_DE *e) { return Add2Ptr(e, le16_to_cpu(e->size)); } static inline struct ATTR_FILE_NAME *de_get_fname(const struct NTFS_DE *e) { return le16_to_cpu(e->key_size) >= SIZEOF_ATTRIBUTE_FILENAME ? Add2Ptr(e, sizeof(struct NTFS_DE)) : NULL; } static inline bool de_is_last(const struct NTFS_DE *e) { return e->flags & NTFS_IE_LAST; } static inline bool de_has_vcn(const struct NTFS_DE *e) { return e->flags & NTFS_IE_HAS_SUBNODES; } static inline bool de_has_vcn_ex(const struct NTFS_DE *e) { return (e->flags & NTFS_IE_HAS_SUBNODES) && (u64)(-1) != *((u64 *)Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64))); } #define MAX_BYTES_PER_NAME_ENTRY \ ALIGN(sizeof(struct NTFS_DE) + \ offsetof(struct ATTR_FILE_NAME, name) + \ NTFS_NAME_LEN * sizeof(short), 8) #define NTFS_INDEX_HDR_HAS_SUBNODES cpu_to_le32(1) struct INDEX_HDR { __le32 de_off; // 0x00: The offset from the start of this structure // to the first NTFS_DE. __le32 used; // 0x04: The size of this structure plus all // entries (quad-word aligned). __le32 total; // 0x08: The allocated size of for this structure plus all entries. __le32 flags; // 0x0C: 0x00 = Small directory, 0x01 = Large directory. // // de_off + used <= total // }; static_assert(sizeof(struct INDEX_HDR) == 0x10); static inline struct NTFS_DE *hdr_first_de(const struct INDEX_HDR *hdr) { u32 de_off = le32_to_cpu(hdr->de_off); u32 used = le32_to_cpu(hdr->used); struct NTFS_DE *e; u16 esize; if (de_off >= used || size_add(de_off, sizeof(struct NTFS_DE)) > used) return NULL; e = Add2Ptr(hdr, de_off); esize = le16_to_cpu(e->size); if (esize < sizeof(struct NTFS_DE) || de_off + esize > used) return NULL; return e; } static inline struct NTFS_DE *hdr_next_de(const struct INDEX_HDR *hdr, const struct NTFS_DE *e) { size_t off = PtrOffset(hdr, e); u32 used = le32_to_cpu(hdr->used); u16 esize; if (off >= used) return NULL; esize = le16_to_cpu(e->size); if (esize < sizeof(struct NTFS_DE) || off + esize + sizeof(struct NTFS_DE) > used) return NULL; return Add2Ptr(e, esize); } static inline bool hdr_has_subnode(const struct INDEX_HDR *hdr) { return hdr->flags & NTFS_INDEX_HDR_HAS_SUBNODES; } struct INDEX_BUFFER { struct NTFS_RECORD_HEADER rhdr; // 'INDX' __le64 vbn; // 0x10: vcn if index >= cluster or vsn id index < cluster struct INDEX_HDR ihdr; // 0x18: }; static_assert(sizeof(struct INDEX_BUFFER) == 0x28); static inline bool ib_is_empty(const struct INDEX_BUFFER *ib) { const struct NTFS_DE *first = hdr_first_de(&ib->ihdr); return !first || de_is_last(first); } static inline bool ib_is_leaf(const struct INDEX_BUFFER *ib) { return !(ib->ihdr.flags & NTFS_INDEX_HDR_HAS_SUBNODES); } /* Index root structure ( 0x90 ). */ enum COLLATION_RULE { NTFS_COLLATION_TYPE_BINARY = cpu_to_le32(0), // $I30 NTFS_COLLATION_TYPE_FILENAME = cpu_to_le32(0x01), // $SII of $Secure and $Q of Quota NTFS_COLLATION_TYPE_UINT = cpu_to_le32(0x10), // $O of Quota NTFS_COLLATION_TYPE_SID = cpu_to_le32(0x11), // $SDH of $Secure NTFS_COLLATION_TYPE_SECURITY_HASH = cpu_to_le32(0x12), // $O of ObjId and "$R" for Reparse NTFS_COLLATION_TYPE_UINTS = cpu_to_le32(0x13) }; static_assert(sizeof(enum COLLATION_RULE) == 4); // struct INDEX_ROOT { enum ATTR_TYPE type; // 0x00: The type of attribute to index on. enum COLLATION_RULE rule; // 0x04: The rule. __le32 index_block_size;// 0x08: The size of index record. u8 index_block_clst; // 0x0C: The number of clusters or sectors per index. u8 res[3]; struct INDEX_HDR ihdr; // 0x10: }; static_assert(sizeof(struct INDEX_ROOT) == 0x20); static_assert(offsetof(struct INDEX_ROOT, ihdr) == 0x10); #define VOLUME_FLAG_DIRTY cpu_to_le16(0x0001) #define VOLUME_FLAG_RESIZE_LOG_FILE cpu_to_le16(0x0002) struct VOLUME_INFO { __le64 res1; // 0x00 u8 major_ver; // 0x08: NTFS major version number (before .) u8 minor_ver; // 0x09: NTFS minor version number (after .) __le16 flags; // 0x0A: Volume flags, see VOLUME_FLAG_XXX }; // sizeof=0xC #define SIZEOF_ATTRIBUTE_VOLUME_INFO 0xc #define NTFS_LABEL_MAX_LENGTH (0x100 / sizeof(short)) #define NTFS_ATTR_INDEXABLE cpu_to_le32(0x00000002) #define NTFS_ATTR_DUPALLOWED cpu_to_le32(0x00000004) #define NTFS_ATTR_MUST_BE_INDEXED cpu_to_le32(0x00000010) #define NTFS_ATTR_MUST_BE_NAMED cpu_to_le32(0x00000020) #define NTFS_ATTR_MUST_BE_RESIDENT cpu_to_le32(0x00000040) #define NTFS_ATTR_LOG_ALWAYS cpu_to_le32(0x00000080) /* $AttrDef file entry. */ struct ATTR_DEF_ENTRY { __le16 name[0x40]; // 0x00: Attr name. enum ATTR_TYPE type; // 0x80: struct ATTRIB type. __le32 res; // 0x84: enum COLLATION_RULE rule; // 0x88: __le32 flags; // 0x8C: NTFS_ATTR_XXX (see above). __le64 min_sz; // 0x90: Minimum attribute data size. __le64 max_sz; // 0x98: Maximum attribute data size. }; static_assert(sizeof(struct ATTR_DEF_ENTRY) == 0xa0); /* Object ID (0x40) */ struct OBJECT_ID { struct GUID ObjId; // 0x00: Unique Id assigned to file. // Birth Volume Id is the Object Id of the Volume on. // which the Object Id was allocated. It never changes. struct GUID BirthVolumeId; //0x10: // Birth Object Id is the first Object Id that was // ever assigned to this MFT Record. I.e. If the Object Id // is changed for some reason, this field will reflect the // original value of the Object Id. struct GUID BirthObjectId; // 0x20: // Domain Id is currently unused but it is intended to be // used in a network environment where the local machine is // part of a Windows 2000 Domain. This may be used in a Windows // 2000 Advanced Server managed domain. struct GUID DomainId; // 0x30: }; static_assert(sizeof(struct OBJECT_ID) == 0x40); /* O Directory entry structure ( rule = 0x13 ) */ struct NTFS_DE_O { struct NTFS_DE de; struct GUID ObjId; // 0x10: Unique Id assigned to file. struct MFT_REF ref; // 0x20: MFT record number with this file. // Birth Volume Id is the Object Id of the Volume on // which the Object Id was allocated. It never changes. struct GUID BirthVolumeId; // 0x28: // Birth Object Id is the first Object Id that was // ever assigned to this MFT Record. I.e. If the Object Id // is changed for some reason, this field will reflect the // original value of the Object Id. // This field is valid if data_size == 0x48. struct GUID BirthObjectId; // 0x38: // Domain Id is currently unused but it is intended // to be used in a network environment where the local // machine is part of a Windows 2000 Domain. This may be // used in a Windows 2000 Advanced Server managed domain. struct GUID BirthDomainId; // 0x48: }; static_assert(sizeof(struct NTFS_DE_O) == 0x58); /* Q Directory entry structure ( rule = 0x11 ) */ struct NTFS_DE_Q { struct NTFS_DE de; __le32 owner_id; // 0x10: Unique Id assigned to file /* here is 0x30 bytes of user quota. NOTE: 4 byte aligned! */ __le32 Version; // 0x14: 0x02 __le32 Flags; // 0x18: Quota flags, see above __le64 BytesUsed; // 0x1C: __le64 ChangeTime; // 0x24: __le64 WarningLimit; // 0x28: __le64 HardLimit; // 0x34: __le64 ExceededTime; // 0x3C: // SID is placed here }__packed; // sizeof() = 0x44 static_assert(sizeof(struct NTFS_DE_Q) == 0x44); #define SecurityDescriptorsBlockSize 0x40000 // 256K #define SecurityDescriptorMaxSize 0x20000 // 128K #define Log2OfSecurityDescriptorsBlockSize 18 struct SECURITY_KEY { __le32 hash; // Hash value for descriptor __le32 sec_id; // Security Id (guaranteed unique) }; /* Security descriptors (the content of $Secure::SDS data stream) */ struct SECURITY_HDR { struct SECURITY_KEY key; // 0x00: Security Key. __le64 off; // 0x08: Offset of this entry in the file. __le32 size; // 0x10: Size of this entry, 8 byte aligned. /* * Security descriptor itself is placed here. * Total size is 16 byte aligned. */ } __packed; static_assert(sizeof(struct SECURITY_HDR) == 0x14); /* SII Directory entry structure */ struct NTFS_DE_SII { struct NTFS_DE de; __le32 sec_id; // 0x10: Key: sizeof(security_id) = wKeySize struct SECURITY_HDR sec_hdr; // 0x14: } __packed; static_assert(offsetof(struct NTFS_DE_SII, sec_hdr) == 0x14); static_assert(sizeof(struct NTFS_DE_SII) == 0x28); /* SDH Directory entry structure */ struct NTFS_DE_SDH { struct NTFS_DE de; struct SECURITY_KEY key; // 0x10: Key struct SECURITY_HDR sec_hdr; // 0x18: Data __le16 magic[2]; // 0x2C: 0x00490049 "I I" }; #define SIZEOF_SDH_DIRENTRY 0x30 struct REPARSE_KEY { __le32 ReparseTag; // 0x00: Reparse Tag struct MFT_REF ref; // 0x04: MFT record number with this file }; // sizeof() = 0x0C static_assert(offsetof(struct REPARSE_KEY, ref) == 0x04); #define SIZEOF_REPARSE_KEY 0x0C /* Reparse Directory entry structure */ struct NTFS_DE_R { struct NTFS_DE de; struct REPARSE_KEY key; // 0x10: Reparse Key. u32 zero; // 0x1c: }; // sizeof() = 0x20 static_assert(sizeof(struct NTFS_DE_R) == 0x20); /* CompressReparseBuffer.WofVersion */ #define WOF_CURRENT_VERSION cpu_to_le32(1) /* CompressReparseBuffer.WofProvider */ #define WOF_PROVIDER_WIM cpu_to_le32(1) /* CompressReparseBuffer.WofProvider */ #define WOF_PROVIDER_SYSTEM cpu_to_le32(2) /* CompressReparseBuffer.ProviderVer */ #define WOF_PROVIDER_CURRENT_VERSION cpu_to_le32(1) #define WOF_COMPRESSION_XPRESS4K cpu_to_le32(0) // 4k #define WOF_COMPRESSION_LZX32K cpu_to_le32(1) // 32k #define WOF_COMPRESSION_XPRESS8K cpu_to_le32(2) // 8k #define WOF_COMPRESSION_XPRESS16K cpu_to_le32(3) // 16k /* * ATTR_REPARSE (0xC0) * * The reparse struct GUID structure is used by all 3rd party layered drivers to * store data in a reparse point. For non-Microsoft tags, The struct GUID field * cannot be GUID_NULL. * The constraints on reparse tags are defined below. * Microsoft tags can also be used with this format of the reparse point buffer. */ struct REPARSE_POINT { __le32 ReparseTag; // 0x00: __le16 ReparseDataLength;// 0x04: __le16 Reserved; struct GUID Guid; // 0x08: // // Here GenericReparseBuffer is placed // }; static_assert(sizeof(struct REPARSE_POINT) == 0x18); /* * The value of the following constant needs to satisfy the following * conditions: * (1) Be at least as large as the largest of the reserved tags. * (2) Be strictly smaller than all the tags in use. */ #define IO_REPARSE_TAG_RESERVED_RANGE 1 /* * The reparse tags are a ULONG. The 32 bits are laid out as follows: * * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 * +-+-+-+-+-----------------------+-------------------------------+ * |M|R|N|R| Reserved bits | Reparse Tag Value | * +-+-+-+-+-----------------------+-------------------------------+ * * M is the Microsoft bit. When set to 1, it denotes a tag owned by Microsoft. * All ISVs must use a tag with a 0 in this position. * Note: If a Microsoft tag is used by non-Microsoft software, the * behavior is not defined. * * R is reserved. Must be zero for non-Microsoft tags. * * N is name surrogate. When set to 1, the file represents another named * entity in the system. * * The M and N bits are OR-able. * The following macros check for the M and N bit values: */ /* * Macro to determine whether a reparse point tag corresponds to a tag * owned by Microsoft. */ #define IsReparseTagMicrosoft(_tag) (((_tag)&IO_REPARSE_TAG_MICROSOFT)) /* Macro to determine whether a reparse point tag is a name surrogate. */ #define IsReparseTagNameSurrogate(_tag) (((_tag)&IO_REPARSE_TAG_NAME_SURROGATE)) /* * The following constant represents the bits that are valid to use in * reparse tags. */ #define IO_REPARSE_TAG_VALID_VALUES 0xF000FFFF /* * Macro to determine whether a reparse tag is a valid tag. */ #define IsReparseTagValid(_tag) \ (!((_tag) & ~IO_REPARSE_TAG_VALID_VALUES) && \ ((_tag) > IO_REPARSE_TAG_RESERVED_RANGE)) /* Microsoft tags for reparse points. */ enum IO_REPARSE_TAG { IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0), IO_REPARSE_TAG_NAME_SURROGATE = cpu_to_le32(0x20000000), IO_REPARSE_TAG_MICROSOFT = cpu_to_le32(0x80000000), IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0xA0000003), IO_REPARSE_TAG_SYMLINK = cpu_to_le32(0xA000000C), IO_REPARSE_TAG_HSM = cpu_to_le32(0xC0000004), IO_REPARSE_TAG_SIS = cpu_to_le32(0x80000007), IO_REPARSE_TAG_DEDUP = cpu_to_le32(0x80000013), IO_REPARSE_TAG_COMPRESS = cpu_to_le32(0x80000017), /* * The reparse tag 0x80000008 is reserved for Microsoft internal use. * May be published in the future. */ /* Microsoft reparse tag reserved for DFS */ IO_REPARSE_TAG_DFS = cpu_to_le32(0x8000000A), /* Microsoft reparse tag reserved for the file system filter manager. */ IO_REPARSE_TAG_FILTER_MANAGER = cpu_to_le32(0x8000000B), /* Non-Microsoft tags for reparse points */ /* Tag allocated to CONGRUENT, May 2000. Used by IFSTEST. */ IO_REPARSE_TAG_IFSTEST_CONGRUENT = cpu_to_le32(0x00000009), /* Tag allocated to ARKIVIO. */ IO_REPARSE_TAG_ARKIVIO = cpu_to_le32(0x0000000C), /* Tag allocated to SOLUTIONSOFT. */ IO_REPARSE_TAG_SOLUTIONSOFT = cpu_to_le32(0x2000000D), /* Tag allocated to COMMVAULT. */ IO_REPARSE_TAG_COMMVAULT = cpu_to_le32(0x0000000E), /* OneDrive?? */ IO_REPARSE_TAG_CLOUD = cpu_to_le32(0x9000001A), IO_REPARSE_TAG_CLOUD_1 = cpu_to_le32(0x9000101A), IO_REPARSE_TAG_CLOUD_2 = cpu_to_le32(0x9000201A), IO_REPARSE_TAG_CLOUD_3 = cpu_to_le32(0x9000301A), IO_REPARSE_TAG_CLOUD_4 = cpu_to_le32(0x9000401A), IO_REPARSE_TAG_CLOUD_5 = cpu_to_le32(0x9000501A), IO_REPARSE_TAG_CLOUD_6 = cpu_to_le32(0x9000601A), IO_REPARSE_TAG_CLOUD_7 = cpu_to_le32(0x9000701A), IO_REPARSE_TAG_CLOUD_8 = cpu_to_le32(0x9000801A), IO_REPARSE_TAG_CLOUD_9 = cpu_to_le32(0x9000901A), IO_REPARSE_TAG_CLOUD_A = cpu_to_le32(0x9000A01A), IO_REPARSE_TAG_CLOUD_B = cpu_to_le32(0x9000B01A), IO_REPARSE_TAG_CLOUD_C = cpu_to_le32(0x9000C01A), IO_REPARSE_TAG_CLOUD_D = cpu_to_le32(0x9000D01A), IO_REPARSE_TAG_CLOUD_E = cpu_to_le32(0x9000E01A), IO_REPARSE_TAG_CLOUD_F = cpu_to_le32(0x9000F01A), }; #define SYMLINK_FLAG_RELATIVE 1 /* Microsoft reparse buffer. (see DDK for details) */ struct REPARSE_DATA_BUFFER { __le32 ReparseTag; // 0x00: __le16 ReparseDataLength; // 0x04: __le16 Reserved; union { /* If ReparseTag == 0xA0000003 (IO_REPARSE_TAG_MOUNT_POINT) */ struct { __le16 SubstituteNameOffset; // 0x08 __le16 SubstituteNameLength; // 0x0A __le16 PrintNameOffset; // 0x0C __le16 PrintNameLength; // 0x0E __le16 PathBuffer[]; // 0x10 } MountPointReparseBuffer; /* * If ReparseTag == 0xA000000C (IO_REPARSE_TAG_SYMLINK) * https://msdn.microsoft.com/en-us/library/cc232006.aspx */ struct { __le16 SubstituteNameOffset; // 0x08 __le16 SubstituteNameLength; // 0x0A __le16 PrintNameOffset; // 0x0C __le16 PrintNameLength; // 0x0E // 0-absolute path 1- relative path, SYMLINK_FLAG_RELATIVE __le32 Flags; // 0x10 __le16 PathBuffer[]; // 0x14 } SymbolicLinkReparseBuffer; /* If ReparseTag == 0x80000017U */ struct { __le32 WofVersion; // 0x08 == 1 /* * 1 - WIM backing provider ("WIMBoot"), * 2 - System compressed file provider */ __le32 WofProvider; // 0x0C: __le32 ProviderVer; // 0x10: == 1 WOF_FILE_PROVIDER_CURRENT_VERSION == 1 __le32 CompressionFormat; // 0x14: 0, 1, 2, 3. See WOF_COMPRESSION_XXX } CompressReparseBuffer; struct { u8 DataBuffer[1]; // 0x08: } GenericReparseBuffer; }; }; /* ATTR_EA_INFO (0xD0) */ #define FILE_NEED_EA 0x80 // See ntifs.h /* * FILE_NEED_EA, indicates that the file to which the EA belongs cannot be * interpreted without understanding the associated extended attributes. */ struct EA_INFO { __le16 size_pack; // 0x00: Size of buffer to hold in packed form. __le16 count; // 0x02: Count of EA's with FILE_NEED_EA bit set. __le32 size; // 0x04: Size of buffer to hold in unpacked form. }; static_assert(sizeof(struct EA_INFO) == 8); /* ATTR_EA (0xE0) */ struct EA_FULL { __le32 size; // 0x00: (not in packed) u8 flags; // 0x04: u8 name_len; // 0x05: __le16 elength; // 0x06: u8 name[]; // 0x08: }; static_assert(offsetof(struct EA_FULL, name) == 8); #define ACL_REVISION 2 #define ACL_REVISION_DS 4 #define SE_SELF_RELATIVE cpu_to_le16(0x8000) struct SECURITY_DESCRIPTOR_RELATIVE { u8 Revision; u8 Sbz1; __le16 Control; __le32 Owner; __le32 Group; __le32 Sacl; __le32 Dacl; }; static_assert(sizeof(struct SECURITY_DESCRIPTOR_RELATIVE) == 0x14); struct ACE_HEADER { u8 AceType; u8 AceFlags; __le16 AceSize; }; static_assert(sizeof(struct ACE_HEADER) == 4); struct ACL { u8 AclRevision; u8 Sbz1; __le16 AclSize; __le16 AceCount; __le16 Sbz2; }; static_assert(sizeof(struct ACL) == 8); struct SID { u8 Revision; u8 SubAuthorityCount; u8 IdentifierAuthority[6]; __le32 SubAuthority[]; }; static_assert(offsetof(struct SID, SubAuthority) == 8); #endif /* _LINUX_NTFS3_NTFS_H */ // clang-format on
39 39 39 39 39 28 28 28 28 39 39 39 39 39 39 39 39 39 39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/file.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/file.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext4 fs regular file handling primitives * * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) */ #include <linux/time.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/mount.h> #include <linux/path.h> #include <linux/dax.h> #include <linux/quotaops.h> #include <linux/pagevec.h> #include <linux/uio.h> #include <linux/mman.h> #include <linux/backing-dev.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "truncate.h" /* * Returns %true if the given DIO request should be attempted with DIO, or * %false if it should fall back to buffered I/O. * * DIO isn't well specified; when it's unsupported (either due to the request * being misaligned, or due to the file not supporting DIO at all), filesystems * either fall back to buffered I/O or return EINVAL. For files that don't use * any special features like encryption or verity, ext4 has traditionally * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too. * In this case, we should attempt the DIO, *not* fall back to buffered I/O. * * In contrast, in cases where DIO is unsupported due to ext4 features, ext4 * traditionally falls back to buffered I/O. * * This function implements the traditional ext4 behavior in all these cases. */ static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); u32 dio_align = ext4_dio_alignment(inode); if (dio_align == 0) return false; if (dio_align == 1) return true; return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align); } static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } if (!ext4_should_use_dio(iocb, to)) { inode_unlock_shared(inode); /* * Fallback to buffered I/O if the operation being performed on * the inode is not supported by direct I/O. The IOCB_DIRECT * flag needs to be cleared here in order to ensure that the * direct I/O path within generic_file_read_iter() is not * taken. */ iocb->ki_flags &= ~IOCB_DIRECT; return generic_file_read_iter(iocb, to); } ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } /* * Recheck under inode lock - at this point we are sure it cannot * change anymore */ if (!IS_DAX(inode)) { inode_unlock_shared(inode); /* Fallback to buffered IO in case we cannot support DAX */ return generic_file_read_iter(iocb, to); } ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } #endif static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (!iov_iter_count(to)) return 0; /* skip atime */ #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_read_iter(iocb, to); #endif if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_read_iter(iocb, to); return generic_file_read_iter(iocb, to); } static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct inode *inode = file_inode(in); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; return filemap_splice_read(in, ppos, pipe, len, flags); } /* * Called when an inode is released. Note that this is different * from ext4_file_open: open gets called at every open, but release * gets called only when /all/ the files are closed. */ static int ext4_release_file(struct inode *inode, struct file *filp) { if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { ext4_alloc_da_blocks(inode); ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1) && !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) ext4_htree_free_dir_info(filp->private_data); return 0; } /* * This tests whether the IO in question is block-aligned or not. * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they * are converted to written only after the IO is complete. Until they are * mapped, these blocks appear as holes, so dio_zero_block() will assume that * it needs to zero out portions of the start and/or end block. If 2 AIO * threads are at work on the same unwritten block, they must be synchronized * or one thread will zero the other's data, causing corruption. */ static bool ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; unsigned long blockmask = sb->s_blocksize - 1; if ((pos | iov_iter_alignment(from)) & blockmask) return true; return false; } static bool ext4_extending_io(struct inode *inode, loff_t offset, size_t len) { if (offset + len > i_size_read(inode) || offset + len > EXT4_I(inode)->i_disksize) return true; return false; } /* Is IO overwriting allocated or initialized blocks? */ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len, bool *unwritten) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; int err, blklen; if (pos + len > i_size_read(inode)) return false; map.m_lblk = pos >> blkbits; map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); blklen = map.m_len; err = ext4_map_blocks(NULL, inode, &map, 0); if (err != blklen) return false; /* * 'err==len' means that all of the blocks have been preallocated, * regardless of whether they have been initialized or not. We need to * check m_flags to distinguish the unwritten extents. */ *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); return true; } static ssize_t ext4_generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; ret = generic_write_checks(iocb, from); if (ret <= 0) return ret; /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) return -EFBIG; iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } return iov_iter_count(from); } static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret, count; count = ext4_generic_write_checks(iocb, from); if (count <= 0) return count; ret = file_modified(iocb->ki_filp); if (ret) return ret; return count; } static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; ret = generic_perform_write(iocb, from); out: inode_unlock(inode); if (unlikely(ret <= 0)) return ret; return generic_write_sync(iocb, ret); } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, ssize_t written, ssize_t count) { handle_t *handle; lockdep_assert_held_write(&inode->i_rwsem); handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return PTR_ERR(handle); if (ext4_update_inode_size(inode, offset + written)) { int ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) { ext4_journal_stop(handle); return ret; } } if ((written == count) && inode->i_nlink) ext4_orphan_del(handle, inode); ext4_journal_stop(handle); return written; } /* * Clean up the inode after DIO or DAX extending write has completed and the * inode size has been updated using ext4_handle_inode_extension(). */ static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc) { lockdep_assert_held_write(&inode->i_rwsem); if (need_trunc) { ext4_truncate_failed_write(inode); /* * If the truncate operation failed early, then the inode may * still be on the orphan list. In that case, we need to try * remove the inode from the in-memory linked list. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); return; } /* * If i_disksize got extended either due to writeback of delalloc * blocks or extending truncate while the DIO was running we could fail * to cleanup the orphan list in ext4_handle_inode_extension(). Do it * now. */ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { /* * The write has successfully completed. Not much to * do with the error here so just cleanup the orphan * list and hope for the best. */ ext4_orphan_del(NULL, inode); return; } ext4_orphan_del(handle, inode); ext4_journal_stop(handle); } } static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned int flags) { loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) && (iocb->ki_flags & IOCB_ATOMIC)) error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos, size); else if (!error && size && flags & IOMAP_DIO_UNWRITTEN) error = ext4_convert_unwritten_extents(NULL, inode, pos, size); if (error) return error; /* * Note that EXT4_I(inode)->i_disksize can get extended up to * inode->i_size while the I/O was running due to writeback of delalloc * blocks. But the code in ext4_iomap_alloc() is careful to use * zeroed/unwritten extents if this is possible; thus we won't leave * uninitialized blocks in a file even if we didn't succeed in writing * as much as we intended. Also we can race with truncate or write * expanding the file so we have to be a bit careful here. */ if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && pos + size <= i_size_read(inode)) return 0; error = ext4_handle_inode_extension(inode, pos, size, size); return error < 0 ? error : 0; } static const struct iomap_dio_ops ext4_dio_write_ops = { .end_io = ext4_dio_write_end_io, }; /* * The intention here is to start with shared lock acquired then see if any * condition requires an exclusive inode lock. If yes, then we restart the * whole operation by releasing the shared lock and acquiring exclusive lock. * * - For unaligned_io we never take shared lock as it may cause data corruption * when two unaligned IO tries to modify the same block e.g. while zeroing. * * - For extending writes case we don't take the shared lock, since it requires * updating inode i_disksize and/or orphan handling with exclusive lock. * * - shared locking will only be true mostly with overwrites, including * initialized blocks and unwritten blocks. For overwrite unwritten blocks * we protect splitting extents by i_data_sem in ext4_inode_info, so we can * also release exclusive i_rwsem lock. * * - Otherwise we will switch to exclusive i_rwsem lock. */ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, bool *ilock_shared, bool *extend, bool *unwritten, int *dio_flags) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); loff_t offset; size_t count; ssize_t ret; bool overwrite, unaligned_io; restart: ret = ext4_generic_write_checks(iocb, from); if (ret <= 0) goto out; offset = iocb->ki_pos; count = ret; unaligned_io = ext4_unaligned_io(inode, from, offset); *extend = ext4_extending_io(inode, offset, count); overwrite = ext4_overwrite_io(inode, offset, count, unwritten); /* * Determine whether we need to upgrade to an exclusive lock. This is * required to change security info in file_modified(), for extending * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten * extents (as partial block zeroing may be required). * * Note that unaligned writes are allowed under shared lock so long as * they are pure overwrites. Otherwise, concurrent unaligned writes risk * data corruption due to partial block zeroing in the dio layer, and so * the I/O must occur exclusively. */ if (*ilock_shared && ((!IS_NOSEC(inode) || *extend || !overwrite || (unaligned_io && *unwritten)))) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; } inode_unlock_shared(inode); *ilock_shared = false; inode_lock(inode); goto restart; } /* * Now that locking is settled, determine dio flags and exclusivity * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce * behavior already. The inode lock is already held exclusive if the * write is non-overwrite or extending, so drain all outstanding dio and * set the force wait dio flag. */ if (!*ilock_shared && (unaligned_io || *extend)) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; } if (unaligned_io && (!overwrite || *unwritten)) inode_dio_wait(inode); *dio_flags = IOMAP_DIO_FORCE_WAIT; } ret = file_modified(file); if (ret < 0) goto out; return count; out: if (*ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); return ret; } static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; handle_t *handle; struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); const struct iomap_ops *iomap_ops = &ext4_iomap_ops; bool extend = false, unwritten = false; bool ilock_shared = true; int dio_flags = 0; /* * Quick check here without any i_rwsem lock to see if it is extending * IO. A more reliable check is done in ext4_dio_write_checks() with * proper locking in place. */ if (offset + count > i_size_read(inode)) ilock_shared = false; if (iocb->ki_flags & IOCB_NOWAIT) { if (ilock_shared) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { if (!inode_trylock(inode)) return -EAGAIN; } } else { if (ilock_shared) inode_lock_shared(inode); else inode_lock(inode); } /* Fallback to buffered I/O if the inode does not support direct I/O. */ if (!ext4_should_use_dio(iocb, from)) { if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); return ext4_buffered_write_iter(iocb, from); } /* * Prevent inline data from being created since we are going to allocate * blocks for DIO. We know the inode does not currently have inline data * because ext4_should_use_dio() checked for it, but we have to clear * the state flag before the write checks because a lock cycle could * introduce races with other writers. */ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, &unwritten, &dio_flags); if (ret <= 0) return ret; offset = iocb->ki_pos; count = ret; if (extend) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (ret) goto out; } if (ilock_shared && !unwritten) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, dio_flags, NULL, 0); if (ret == -ENOTBLK) ret = 0; if (extend) { /* * We always perform extending DIO write synchronously so by * now the IO is completed and ext4_handle_inode_extension() * was called. Cleanup the inode in case of error or race with * writeback of delalloc blocks. */ WARN_ON_ONCE(ret == -EIOCBQUEUED); ext4_inode_extension_cleanup(inode, ret < 0); } out: if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); if (ret >= 0 && iov_iter_count(from)) { ssize_t err; loff_t endbyte; /* * There is no support for atomic writes on buffered-io yet, * we should never fallback to buffered-io for DIO atomic * writes. */ WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC); offset = iocb->ki_pos; err = ext4_buffered_write_iter(iocb, from); if (err < 0) return err; /* * We need to ensure that the pages within the page cache for * the range covered by this I/O are written to disk and * invalidated. This is in attempt to preserve the expected * direct I/O semantics in the case we fallback to buffered I/O * to complete off the I/O request. */ ret += err; endbyte = offset + err - 1; err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, offset, endbyte); if (!err) invalidate_mapping_pages(iocb->ki_filp->f_mapping, offset >> PAGE_SHIFT, endbyte >> PAGE_SHIFT); } return ret; } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; size_t count; loff_t offset; handle_t *handle; bool extend = false; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) return -EAGAIN; } else { inode_lock(inode); } ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; offset = iocb->ki_pos; count = iov_iter_count(from); if (offset + count > EXT4_I(inode)->i_disksize) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); if (ret) { ext4_journal_stop(handle); goto out; } extend = true; ext4_journal_stop(handle); } ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); if (extend) { ret = ext4_handle_inode_extension(inode, offset, ret, count); ext4_inode_extension_cleanup(inode, ret < (ssize_t)count); } out: inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } #endif static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { int ret; struct inode *inode = file_inode(iocb->ki_filp); ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif if (iocb->ki_flags & IOCB_ATOMIC) { size_t len = iov_iter_count(from); if (len < EXT4_SB(inode->i_sb)->s_awu_min || len > EXT4_SB(inode->i_sb)->s_awu_max) return -EINVAL; ret = generic_atomic_write_valid(iocb, from); if (ret) return ret; } if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); else return ext4_buffered_write_iter(iocb, from); } #ifdef CONFIG_FS_DAX static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { int error = 0; vm_fault_t result; int retries = 0; handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; /* * We have to distinguish real writes from writes which will result in a * COW page; COW writes should *not* poke the journal (the file will not * be changed). Doing so would cause unintended failures when mounted * read-only. * * We check for VM_SHARED rather than vmf->cow_page since the latter is * unset for order != 0 (i.e. only in do_cow_fault); for * other sizes, dax_iomap_fault will handle splitting / fallback so that * we eventually come back with a COW page. */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pfn; if (write) { sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(mapping); retry: handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); if (IS_ERR(handle)) { filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); return VM_FAULT_SIGBUS; } } else { filemap_invalidate_lock_shared(mapping); } result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); if ((result & VM_FAULT_ERROR) && error == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto retry; /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) result = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { filemap_invalidate_unlock_shared(mapping); } return result; } static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { return ext4_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .huge_fault = ext4_dax_huge_fault, .page_mkwrite = ext4_dax_fault, .pfn_mkwrite = ext4_dax_fault, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops #endif static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; static int ext4_file_mmap_prepare(struct vm_area_desc *desc) { int ret; struct file *file = desc->file; struct inode *inode = file->f_mapping->host; struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; if (file->f_mode & FMODE_WRITE) ret = ext4_emergency_state(inode->i_sb); else ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; if (unlikely(ret)) return ret; /* * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. */ if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev)) return -EOPNOTSUPP; file_accessed(file); if (IS_DAX(file_inode(file))) { desc->vm_ops = &ext4_dax_vm_ops; desc->vm_flags |= VM_HUGEPAGE; } else { desc->vm_ops = &ext4_file_vm_ops; } return 0; } static int ext4_sample_last_mounted(struct super_block *sb, struct vfsmount *mnt) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct path path; char buf[64], *cp; handle_t *handle; int err; if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) return 0; if (ext4_emergency_state(sb) || sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) return 0; ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); /* * Sample where the filesystem has been mounted and * store it in the superblock for sysadmin convenience * when trying to sort through large numbers of block * devices or filesystem images. */ memset(buf, 0, sizeof(buf)); path.mnt = mnt; path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); err = 0; if (IS_ERR(cp)) goto out; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); err = PTR_ERR(handle); if (IS_ERR(handle)) goto out; BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto out_journal; lock_buffer(sbi->s_sbh); strtomem_pad(sbi->s_es->s_last_mounted, cp, 0); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); out_journal: ext4_journal_stop(handle); out: sb_end_intwrite(sb); return err; } static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; if (filp->f_mode & FMODE_WRITE) ret = ext4_emergency_state(inode->i_sb); else ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; if (unlikely(ret)) return ret; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); if (ret) return ret; ret = fscrypt_file_open(inode, filp); if (ret) return ret; ret = fsverity_file_open(inode, filp); if (ret) return ret; /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present */ if (filp->f_mode & FMODE_WRITE) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) return ret; } if (ext4_inode_can_atomic_write(inode)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } /* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes * value for each. */ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t maxbytes = ext4_get_maxbytes(inode); switch (whence) { default: return generic_file_llseek_size(file, offset, whence, maxbytes, i_size_read(inode)); case SEEK_HOLE: inode_lock_shared(inode); offset = iomap_seek_hole(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; case SEEK_DATA: inode_lock_shared(inode); offset = iomap_seek_data(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; } if (offset < 0) return offset; return vfs_setpos(file, offset, maxbytes); } const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .iopoll = iocb_bio_iopoll, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .mmap_prepare = ext4_file_mmap_prepare, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, .get_unmapped_area = thp_get_unmapped_area, .splice_read = ext4_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | FOP_DIO_PARALLEL_WRITE | FOP_DONTCACHE, }; const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_file_getattr, .listxattr = ext4_listxattr, .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, .fileattr_set = ext4_fileattr_set, };
1 1709 8 45 55 48 54 1 38 43 43 10 18 13 4 13 13 13 13 10 10 8 206 8 9 8 8 5 16 13 16 3 6 13 13 13 5 5 5 2 3 1 2 3 3 2 13 13 3 2 1 13 13 8 13 13 7 7 7 7 38 38 38 38 10 10 7 7 7 7 37 37 44 44 44 43 43 19 38 13 2 38 38 10 10 37 38 38 38 10 38 38 39 36 44 43 11 25 25 38 20 17 14 10 20 53 9 53 53 53 3 53 52 53 9 3 8 53 52 25 1 47 48 48 48 49 49 48 48 48 49 49 49 47 48 37 13 55 25 1 21 25 55 49 43 43 20 55 43 56 46 46 46 46 46 46 46 56 46 55 56 46 56 46 2 46 46 46 46 46 46 45 46 44 44 22 11 11 46 45 56 1 25 25 1 24 14 55 55 55 54 55 53 53 53 53 53 14 35 40 24 30 18 18 1 18 11 39 39 17 17 17 17 17 8 1 1 39 6 5 5 5 2 1 2 6 1629 554 554 25 6 554 39 39 39 17 17 15 17 39 11 11 11 646 18 17 11 11 10 199 198 199 364 95 364 2 2 2 841 1300 1298 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 // SPDX-License-Identifier: GPL-2.0+ /* * User-space Probes (UProbes) * * Copyright (C) IBM Corporation, 2008-2012 * Authors: * Srikar Dronamraju * Jim Keniston * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include <linux/kernel.h> #include <linux/highmem.h> #include <linux/pagemap.h> /* read_mapping_page */ #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ #include <linux/mmu_notifier.h> #include <linux/swap.h> /* folio_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ #include <linux/percpu-rwsem.h> #include <linux/task_work.h> #include <linux/shmem_fs.h> #include <linux/khugepaged.h> #include <linux/rcupdate_trace.h> #include <linux/workqueue.h> #include <linux/srcu.h> #include <linux/oom.h> /* check_stable_address_space */ #include <linux/pagewalk.h> #include <linux/uprobes.h> #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE static struct rb_root uprobes_tree = RB_ROOT; /* * allows us to skip the uprobe_mmap if there are no uprobe events active * at this time. Probably a fine grained per inode count is better? */ #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); /* Covers return_instance's uprobe lifetime. */ DEFINE_STATIC_SRCU(uretprobes_srcu); /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ refcount_t ref; struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct list_head pending_list; struct list_head consumers; struct inode *inode; /* Also hold a ref to inode */ union { struct rcu_head rcu; struct work_struct work; }; loff_t offset; loff_t ref_ctr_offset; unsigned long flags; /* "unsigned long" so bitops work */ /* * The generic code assumes that it has two members of unknown type * owned by the arch-specific code: * * insn - copy_insn() saves the original instruction here for * arch_uprobe_analyze_insn(). * * ixol - potentially modified instruction to execute out of * line, copied to xol_area by xol_get_insn_slot(). */ struct arch_uprobe arch; }; struct delayed_uprobe { struct list_head list; struct uprobe *uprobe; struct mm_struct *mm; }; static DEFINE_MUTEX(delayed_uprobe_lock); static LIST_HEAD(delayed_uprobe_list); /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction * mangled by set_swbp(). * * On a breakpoint hit, thread contests for a slot. It frees the * slot after singlestep. Currently a fixed number of slots are * allocated. */ struct xol_area { wait_queue_head_t wq; /* if all slots are busy */ unsigned long *bitmap; /* 0 = free slot */ struct page *page; /* * We keep the vma's vm_start rather than a pointer to the vma * itself. The probed process or a naughty kernel module could make * the vma go away, and we must handle that reasonably gracefully. */ unsigned long vaddr; /* Page(s) of instruction slots */ }; static void uprobe_warn(struct task_struct *t, const char *msg) { pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); } /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have * changed after breakpoint was inserted. * - is_register: indicates if we are in register context. * - Return 1 if the specified virtual address is in an * executable vma. */ static bool valid_vma(struct vm_area_struct *vma, bool is_register) { vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE; if (is_register) flags |= VM_WRITE; return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC; } static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) { return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT); } static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) { return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start); } /** * is_swbp_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_swbp_insn * Returns true if @insn is a breakpoint instruction. */ bool __weak is_swbp_insn(uprobe_opcode_t *insn) { return *insn == UPROBE_SWBP_INSN; } /** * is_trap_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_trap_insn * Returns true if @insn is a breakpoint instruction. * * This function is needed for the case where an architecture has multiple * trap instructions (like powerpc). */ bool __weak is_trap_insn(uprobe_opcode_t *insn) { return is_swbp_insn(insn); } static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); kunmap_atomic(kaddr); } static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) { void *kaddr = kmap_atomic(page); memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); kunmap_atomic(kaddr); } static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) { uprobe_opcode_t old_opcode; bool is_swbp; /* * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. * We do not check if it is any other 'trap variant' which could * be conditional trap instruction such as the one powerpc supports. * * The logic is that we do not care if the underlying instruction * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); if (is_swbp_insn(new_opcode)) { if (is_swbp) /* register: already installed? */ return 0; } else { if (!is_swbp) /* unregister: was it changed by us? */ return 0; } return 1; } static struct delayed_uprobe * delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm) { struct delayed_uprobe *du; list_for_each_entry(du, &delayed_uprobe_list, list) if (du->uprobe == uprobe && du->mm == mm) return du; return NULL; } static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm) { struct delayed_uprobe *du; if (delayed_uprobe_check(uprobe, mm)) return 0; du = kzalloc(sizeof(*du), GFP_KERNEL); if (!du) return -ENOMEM; du->uprobe = uprobe; du->mm = mm; list_add(&du->list, &delayed_uprobe_list); return 0; } static void delayed_uprobe_delete(struct delayed_uprobe *du) { if (WARN_ON(!du)) return; list_del(&du->list); kfree(du); } static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm) { struct list_head *pos, *q; struct delayed_uprobe *du; if (!uprobe && !mm) return; list_for_each_safe(pos, q, &delayed_uprobe_list) { du = list_entry(pos, struct delayed_uprobe, list); if (uprobe && du->uprobe != uprobe) continue; if (mm && du->mm != mm) continue; delayed_uprobe_delete(du); } } static bool valid_ref_ctr_vma(struct uprobe *uprobe, struct vm_area_struct *vma) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset); return uprobe->ref_ctr_offset && vma->vm_file && file_inode(vma->vm_file) == uprobe->inode && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && vma->vm_start <= vaddr && vma->vm_end > vaddr; } static struct vm_area_struct * find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *tmp; for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp; return NULL; } static int __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) { void *kaddr; struct page *page; int ret; short *ptr; if (!vaddr || !d) return -EINVAL; ret = get_user_pages_remote(mm, vaddr, 1, FOLL_WRITE, &page, NULL); if (unlikely(ret <= 0)) { /* * We are asking for 1 page. If get_user_pages_remote() fails, * it may return 0, in that case we have to return error. */ return ret == 0 ? -EBUSY : ret; } kaddr = kmap_atomic(page); ptr = kaddr + (vaddr & ~PAGE_MASK); if (unlikely(*ptr + d < 0)) { pr_warn("ref_ctr going negative. vaddr: 0x%lx, " "curr val: %d, delta: %d\n", vaddr, *ptr, d); ret = -EINVAL; goto out; } *ptr += d; ret = 0; out: kunmap_atomic(kaddr); put_page(page); return ret; } static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) uprobe->ref_ctr_offset, mm); } static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, short d) { struct vm_area_struct *rc_vma; unsigned long rc_vaddr; int ret = 0; rc_vma = find_ref_ctr_vma(uprobe, mm); if (rc_vma) { rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset); ret = __update_ref_ctr(mm, rc_vaddr, d); if (ret) update_ref_ctr_warn(uprobe, mm, d); if (d > 0) return ret; } mutex_lock(&delayed_uprobe_lock); if (d > 0) ret = delayed_uprobe_add(uprobe, mm); else delayed_uprobe_remove(uprobe, mm); mutex_unlock(&delayed_uprobe_lock); return ret; } static bool orig_page_is_identical(struct vm_area_struct *vma, unsigned long vaddr, struct page *page, bool *pmd_mappable) { const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT; struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping, index); struct page *orig_page; bool identical; if (IS_ERR(orig_folio)) return false; orig_page = folio_file_page(orig_folio, index); *pmd_mappable = folio_test_pmd_mappable(orig_folio); identical = folio_test_uptodate(orig_folio) && pages_identical(page, orig_page); folio_put(orig_folio); return identical; } static int __uprobe_write_opcode(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, unsigned long opcode_vaddr, uprobe_opcode_t opcode) { const unsigned long vaddr = opcode_vaddr & PAGE_MASK; const bool is_register = !!is_swbp_insn(&opcode); bool pmd_mappable; /* For now, we'll only handle PTE-mapped folios. */ if (fw->level != FW_LEVEL_PTE) return -EFAULT; /* * See can_follow_write_pte(): we'd actually prefer a writable PTE here, * but the VMA might not be writable. */ if (!pte_write(fw->pte)) { if (!PageAnonExclusive(fw->page)) return -EFAULT; if (unlikely(userfaultfd_pte_wp(vma, fw->pte))) return -EFAULT; /* SOFTDIRTY is handled via pte_mkdirty() below. */ } /* * We'll temporarily unmap the page and flush the TLB, such that we can * modify the page atomically. */ flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); /* * When unregistering, we may only zap a PTE if uffd is disabled and * there are no unexpected folio references ... */ if (is_register || userfaultfd_missing(vma) || (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1)) goto remap; /* * ... and the mapped page is identical to the original page that * would get faulted in on next access. */ if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable)) goto remap; dec_mm_counter(vma->vm_mm, MM_ANONPAGES); folio_remove_rmap_pte(folio, fw->page, vma); if (!folio_mapped(folio) && folio_test_swapcache(folio) && folio_trylock(folio)) { folio_free_swap(folio); folio_unlock(folio); } folio_put(folio); return pmd_mappable; remap: /* * Make sure that our copy_to_page() changes become visible before the * set_pte_at() write. */ smp_wmb(); /* We modified the page. Make sure to mark the PTE dirty. */ set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte)); return 0; } /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction * supported by that architecture then we need to modify is_trap_at_addr and * uprobe_write_opcode accordingly. This would never be a problem for archs * that have fixed length instructions. * * uprobe_write_opcode - write the opcode at a given virtual address. * @auprobe: arch specific probepoint information. * @vma: the probed virtual memory area. * @opcode_vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @opcode_vaddr. * * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, uprobe_opcode_t opcode) { const unsigned long vaddr = opcode_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; int ret, is_register, ref_ctr_updated = 0; unsigned int gup_flags = FOLL_FORCE; struct mmu_notifier_range range; struct folio_walk fw; struct folio *folio; struct page *page; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) return -EINVAL; /* * When registering, we have to break COW to get an exclusive anonymous * page that we can safely modify. Use FOLL_WRITE to trigger a write * fault if required. When unregistering, we might be lucky and the * anon page is already gone. So defer write faults until really * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() * cannot deal with PMDs yet. */ if (is_register) gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; retry: ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL); if (ret <= 0) goto out; folio = page_folio(page); ret = verify_opcode(page, opcode_vaddr, &opcode); if (ret <= 0) { folio_put(folio); goto out; } /* We are going to replace instruction, update ref_ctr. */ if (!ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); if (ret) { folio_put(folio); goto out; } ref_ctr_updated = 1; } ret = 0; if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) { VM_WARN_ON_ONCE(is_register); folio_put(folio); goto out; } if (!is_register) { /* * In the common case, we'll be able to zap the page when * unregistering. So trigger MMU notifiers now, as we won't * be able to do it under PTL. */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vaddr, vaddr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); } ret = -EAGAIN; /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); folio_walk_end(&fw, vma); } if (!is_register) mmu_notifier_invalidate_range_end(&range); folio_put(folio); switch (ret) { case -EFAULT: gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; fallthrough; case -EAGAIN: goto retry; default: break; } out: /* Revert back reference counter if instruction update failed. */ if (ret < 0 && ref_ctr_updated) update_ref_ctr(uprobe, mm, is_register ? -1 : 1); /* try collapse pmd for compound page */ if (ret > 0) collapse_pte_mapped_thp(mm, vaddr, false); return ret < 0 ? ret : 0; } /** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. * @vma: the probed virtual memory area. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN); } /** * set_orig_insn - Restore the original instruction. * @vma: the probed virtual memory area. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. */ int __weak set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } /* uprobe should have guaranteed positive refcount */ static struct uprobe *get_uprobe(struct uprobe *uprobe) { refcount_inc(&uprobe->ref); return uprobe; } /* * uprobe should have guaranteed lifetime, which can be either of: * - caller already has refcount taken (and wants an extra one); * - uprobe is RCU protected and won't be freed until after grace period; * - we are holding uprobes_treelock (for read or write, doesn't matter). */ static struct uprobe *try_get_uprobe(struct uprobe *uprobe) { if (refcount_inc_not_zero(&uprobe->ref)) return uprobe; return NULL; } static inline bool uprobe_is_active(struct uprobe *uprobe) { return !RB_EMPTY_NODE(&uprobe->rb_node); } static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); kfree(uprobe); } static void uprobe_free_srcu(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace); } static void uprobe_free_deferred(struct work_struct *work) { struct uprobe *uprobe = container_of(work, struct uprobe, work); write_lock(&uprobes_treelock); if (uprobe_is_active(uprobe)) { write_seqcount_begin(&uprobes_seqcount); rb_erase(&uprobe->rb_node, &uprobes_tree); write_seqcount_end(&uprobes_seqcount); } write_unlock(&uprobes_treelock); /* * If application munmap(exec_vma) before uprobe_unregister() * gets called, we don't get a chance to remove uprobe from * delayed_uprobe_list from remove_breakpoint(). Do it here. */ mutex_lock(&delayed_uprobe_lock); delayed_uprobe_remove(uprobe, NULL); mutex_unlock(&delayed_uprobe_lock); /* start srcu -> rcu_tasks_trace -> kfree chain */ call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu); } static void put_uprobe(struct uprobe *uprobe) { if (!refcount_dec_and_test(&uprobe->ref)) return; INIT_WORK(&uprobe->work, uprobe_free_deferred); schedule_work(&uprobe->work); } /* Initialize hprobe as SRCU-protected "leased" uprobe */ static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx) { WARN_ON(!uprobe); hprobe->state = HPROBE_LEASED; hprobe->uprobe = uprobe; hprobe->srcu_idx = srcu_idx; } /* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */ static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe) { hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE; hprobe->uprobe = uprobe; hprobe->srcu_idx = -1; } /* * hprobe_consume() fetches hprobe's underlying uprobe and detects whether * uprobe is SRCU protected or is refcounted. hprobe_consume() can be * used only once for a given hprobe. * * Caller has to call hprobe_finalize() and pass previous hprobe_state, so * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever * is appropriate. */ static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate) { *hstate = xchg(&hprobe->state, HPROBE_CONSUMED); switch (*hstate) { case HPROBE_LEASED: case HPROBE_STABLE: return hprobe->uprobe; case HPROBE_GONE: /* uprobe is NULL, no SRCU */ case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */ return NULL; default: WARN(1, "hprobe invalid state %d", *hstate); return NULL; } } /* * Reset hprobe state and, if hprobe was LEASED, release SRCU lock. * hprobe_finalize() can only be used from current context after * hprobe_consume() call (which determines uprobe and hstate value). */ static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate) { switch (hstate) { case HPROBE_LEASED: __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); break; case HPROBE_STABLE: put_uprobe(hprobe->uprobe); break; case HPROBE_GONE: case HPROBE_CONSUMED: break; default: WARN(1, "hprobe invalid state %d", hstate); break; } } /* * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED) * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of * them can win the race to perform SRCU unlocking. Whoever wins must perform * SRCU unlock. * * Returns underlying valid uprobe or NULL, if there was no underlying uprobe * to begin with or we failed to bump its refcount and it's going away. * * Returned non-NULL uprobe can be still safely used within an ongoing SRCU * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has * an extra refcount for caller to assume and use. Otherwise, it's not * guaranteed that returned uprobe has a positive refcount, so caller has to * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current * SRCU lock region. See dup_utask(). */ static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) { enum hprobe_state hstate; /* * Caller should guarantee that return_instance is not going to be * freed from under us. This can be achieved either through holding * rcu_read_lock() or by owning return_instance in the first place. * * Underlying uprobe is itself protected from reuse by SRCU, so ensure * SRCU lock is held properly. */ lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); hstate = READ_ONCE(hprobe->state); switch (hstate) { case HPROBE_STABLE: /* uprobe has positive refcount, bump refcount, if necessary */ return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe; case HPROBE_GONE: /* * SRCU was unlocked earlier and we didn't manage to take * uprobe refcnt, so it's effectively NULL */ return NULL; case HPROBE_CONSUMED: /* * uprobe was consumed, so it's effectively NULL as far as * uretprobe processing logic is concerned */ return NULL; case HPROBE_LEASED: { struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe); /* * Try to switch hprobe state, guarding against * hprobe_consume() or another hprobe_expire() racing with us. * Note, if we failed to get uprobe refcount, we use special * HPROBE_GONE state to signal that hprobe->uprobe shouldn't * be used as it will be freed after SRCU is unlocked. */ if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) { /* We won the race, we are the ones to unlock SRCU */ __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); return get ? get_uprobe(uprobe) : uprobe; } /* * We lost the race, undo refcount bump (if it ever happened), * unless caller would like an extra refcount anyways. */ if (uprobe && !get) put_uprobe(uprobe); /* * Even if hprobe_consume() or another hprobe_expire() wins * the state update race and unlocks SRCU from under us, we * still have a guarantee that underyling uprobe won't be * freed due to ongoing caller's SRCU lock region, so we can * return it regardless. Also, if `get` was true, we also have * an extra ref for the caller to own. This is used in dup_utask(). */ return uprobe; } default: WARN(1, "unknown hprobe state %d", hstate); return NULL; } } static __always_inline int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset, const struct uprobe *r) { if (l_inode < r->inode) return -1; if (l_inode > r->inode) return 1; if (l_offset < r->offset) return -1; if (l_offset > r->offset) return 1; return 0; } #define __node_2_uprobe(node) \ rb_entry((node), struct uprobe, rb_node) struct __uprobe_key { struct inode *inode; loff_t offset; }; static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b) { const struct __uprobe_key *a = key; return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b)); } static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b) { struct uprobe *u = __node_2_uprobe(a); return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); } /* * Assumes being inside RCU protected region. * No refcount is taken on returned uprobe. */ static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) { struct __uprobe_key key = { .inode = inode, .offset = offset, }; struct rb_node *node; unsigned int seq; lockdep_assert(rcu_read_lock_trace_held()); do { seq = read_seqcount_begin(&uprobes_seqcount); node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key); /* * Lockless RB-tree lookups can result only in false negatives. * If the element is found, it is correct and can be returned * under RCU protection. If we find nothing, we need to * validate that seqcount didn't change. If it did, we have to * try again as we might have missed the element (false * negative). If seqcount is unchanged, search truly failed. */ if (node) return __node_2_uprobe(node); } while (read_seqcount_retry(&uprobes_seqcount, seq)); return NULL; } /* * Attempt to insert a new uprobe into uprobes_tree. * * If uprobe already exists (for given inode+offset), we just increment * refcount of previously existing uprobe. * * If not, a provided new instance of uprobe is inserted into the tree (with * assumed initial refcount == 1). * * In any case, we return a uprobe instance that ends up being in uprobes_tree. * Caller has to clean up new uprobe instance, if it ended up not being * inserted into the tree. * * We assume that uprobes_treelock is held for writing. */ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { struct rb_node *node; again: node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); if (node) { struct uprobe *u = __node_2_uprobe(node); if (!try_get_uprobe(u)) { rb_erase(node, &uprobes_tree); RB_CLEAR_NODE(&u->rb_node); goto again; } return u; } return uprobe; } /* * Acquire uprobes_treelock and insert uprobe into uprobes_tree * (or reuse existing one, see __insert_uprobe() comments above). */ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; write_lock(&uprobes_treelock); write_seqcount_begin(&uprobes_seqcount); u = __insert_uprobe(uprobe); write_seqcount_end(&uprobes_seqcount); write_unlock(&uprobes_treelock); return u; } static void ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe) { pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx " "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) cur_uprobe->ref_ctr_offset, (unsigned long long) uprobe->ref_ctr_offset); } static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, loff_t ref_ctr_offset) { struct uprobe *uprobe, *cur_uprobe; uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); if (!uprobe) return ERR_PTR(-ENOMEM); uprobe->inode = inode; uprobe->offset = offset; uprobe->ref_ctr_offset = ref_ctr_offset; INIT_LIST_HEAD(&uprobe->consumers); init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); RB_CLEAR_NODE(&uprobe->rb_node); refcount_set(&uprobe->ref, 1); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ if (cur_uprobe != uprobe) { if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { ref_ctr_mismatch_warn(cur_uprobe, uprobe); put_uprobe(cur_uprobe); kfree(uprobe); return ERR_PTR(-EINVAL); } kfree(uprobe); uprobe = cur_uprobe; } return uprobe; } static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { static atomic64_t id; down_write(&uprobe->consumer_rwsem); list_add_rcu(&uc->cons_node, &uprobe->consumers); uc->id = (__u64) atomic64_inc_return(&id); up_write(&uprobe->consumer_rwsem); } /* * For uprobe @uprobe, delete the consumer @uc. * Should never be called with consumer that's not part of @uprobe->consumers. */ static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); list_del_rcu(&uc->cons_node); up_write(&uprobe->consumer_rwsem); } static int __copy_insn(struct address_space *mapping, struct file *filp, void *insn, int nbytes, loff_t offset) { struct page *page; /* * Ensure that the page that has the original instruction is populated * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(), * see uprobe_register(). */ if (mapping->a_ops->read_folio) page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page); copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; } static int copy_insn(struct uprobe *uprobe, struct file *filp) { struct address_space *mapping = uprobe->inode->i_mapping; loff_t offs = uprobe->offset; void *insn = &uprobe->arch.insn; int size = sizeof(uprobe->arch.insn); int len, err = -EIO; /* Copy only available bytes, -EIO if nothing was read */ do { if (offs >= i_size_read(uprobe->inode)) break; len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); err = __copy_insn(mapping, filp, insn, len, offs); if (err) break; insn += len; offs += len; size -= len; } while (size); return err; } static int prepare_uprobe(struct uprobe *uprobe, struct file *file, struct mm_struct *mm, unsigned long vaddr) { int ret = 0; if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) return ret; /* TODO: move this into _register, until then we abuse this sem. */ down_write(&uprobe->consumer_rwsem); if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) goto out; ret = copy_insn(uprobe, file); if (ret) goto out; ret = -ENOTSUPP; if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) goto out; smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */ set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: up_write(&uprobe->consumer_rwsem); return ret; } static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { return !uc->filter || uc->filter(uc, mm); } static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false; down_read(&uprobe->consumer_rwsem); list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { ret = consumer_filter(uc, mm); if (ret) break; } up_read(&uprobe->consumer_rwsem); return ret; } static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long vaddr) { struct mm_struct *mm = vma->vm_mm; bool first_uprobe; int ret; ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); if (ret) return ret; /* * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), * the task can hit this breakpoint right after __replace_page(). */ first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); if (first_uprobe) set_bit(MMF_HAS_UPROBES, &mm->flags); ret = set_swbp(&uprobe->arch, vma, vaddr); if (!ret) clear_bit(MMF_RECALC_UPROBES, &mm->flags); else if (first_uprobe) clear_bit(MMF_HAS_UPROBES, &mm->flags); return ret; } static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long vaddr) { struct mm_struct *mm = vma->vm_mm; set_bit(MMF_RECALC_UPROBES, &mm->flags); return set_orig_insn(&uprobe->arch, vma, vaddr); } struct map_info { struct map_info *next; struct mm_struct *mm; unsigned long vaddr; }; static inline struct map_info *free_map_info(struct map_info *info) { struct map_info *next = info->next; kfree(info); return next; } static struct map_info * build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { unsigned long pgoff = offset >> PAGE_SHIFT; struct vm_area_struct *vma; struct map_info *curr = NULL; struct map_info *prev = NULL; struct map_info *info; int more = 0; again: i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; if (!prev && !more) { /* * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); if (prev) prev->next = NULL; } if (!prev) { more++; continue; } if (!mmget_not_zero(vma->vm_mm)) continue; info = prev; prev = prev->next; info->next = curr; curr = info; info->mm = vma->vm_mm; info->vaddr = offset_to_vaddr(vma, offset); } i_mmap_unlock_read(mapping); if (!more) goto out; prev = curr; while (curr) { mmput(curr->mm); curr = curr->next; } do { info = kmalloc(sizeof(struct map_info), GFP_KERNEL); if (!info) { curr = ERR_PTR(-ENOMEM); goto out; } info->next = prev; prev = info; } while (--more); goto again; out: while (prev) prev = free_map_info(prev); return curr; } static int register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) { bool is_register = !!new; struct map_info *info; int err = 0; percpu_down_write(&dup_mmap_sem); info = build_map_info(uprobe->inode->i_mapping, uprobe->offset, is_register); if (IS_ERR(info)) { err = PTR_ERR(info); goto out; } while (info) { struct mm_struct *mm = info->mm; struct vm_area_struct *vma; if (err && is_register) goto free; /* * We take mmap_lock for writing to avoid the race with * find_active_uprobe_rcu() which takes mmap_lock for reading. * Thus this install_breakpoint() can not make * is_trap_at_addr() true right after find_uprobe_rcu() * returns NULL in find_active_uprobe_rcu(). */ mmap_write_lock(mm); if (check_stable_address_space(mm)) goto unlock; vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || file_inode(vma->vm_file) != uprobe->inode) goto unlock; if (vma->vm_start > info->vaddr || vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; if (is_register) { /* consult only the "caller", new consumer. */ if (consumer_filter(new, mm)) err = install_breakpoint(uprobe, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { if (!filter_chain(uprobe, mm)) err |= remove_breakpoint(uprobe, vma, info->vaddr); } unlock: mmap_write_unlock(mm); free: mmput(mm); info = free_map_info(info); } out: percpu_up_write(&dup_mmap_sem); return err; } /** * uprobe_unregister_nosync - unregister an already registered probe. * @uprobe: uprobe to remove * @uc: identify which probe if multiple probes are colocated. */ void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; down_write(&uprobe->register_rwsem); consumer_del(uprobe, uc); err = register_for_each_vma(uprobe, NULL); up_write(&uprobe->register_rwsem); /* TODO : cant unregister? schedule a worker thread */ if (unlikely(err)) { uprobe_warn(current, "unregister, leaking uprobe"); return; } put_uprobe(uprobe); } EXPORT_SYMBOL_GPL(uprobe_unregister_nosync); void uprobe_unregister_sync(void) { /* * Now that handler_chain() and handle_uretprobe_chain() iterate over * uprobe->consumers list under RCU protection without holding * uprobe->register_rwsem, we need to wait for RCU grace period to * make sure that we can't call into just unregistered * uprobe_consumer's callbacks anymore. If we don't do that, fast and * unlucky enough caller can free consumer's memory and cause * handler_chain() or handle_uretprobe_chain() to do an use-after-free. */ synchronize_rcu_tasks_trace(); synchronize_srcu(&uretprobes_srcu); } EXPORT_SYMBOL_GPL(uprobe_unregister_sync); /** * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. * * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * * Return: pointer to the new uprobe on success or an ERR_PTR on failure. */ struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; /* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) return ERR_PTR(-EINVAL); /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ if (!inode->i_mapping->a_ops->read_folio && !shmem_mapping(inode->i_mapping)) return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return ERR_PTR(-EINVAL); /* * This ensures that copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) return ERR_PTR(-EINVAL); if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) return ERR_PTR(-EINVAL); uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (IS_ERR(uprobe)) return uprobe; down_write(&uprobe->register_rwsem); consumer_add(uprobe, uc); ret = register_for_each_vma(uprobe, uc); up_write(&uprobe->register_rwsem); if (ret) { uprobe_unregister_nosync(uprobe, uc); /* * Registration might have partially succeeded, so we can have * this consumer being called right at this time. We need to * sync here. It's ok, it's unlikely slow path. */ uprobe_unregister_sync(); return ERR_PTR(ret); } return uprobe; } EXPORT_SYMBOL_GPL(uprobe_register); /** * uprobe_apply - add or remove the breakpoints according to @uc->filter * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints * Return: 0 on success or negative error code. */ int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { struct uprobe_consumer *con; int ret = -ENOENT; down_write(&uprobe->register_rwsem); rcu_read_lock_trace(); list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { if (con == uc) { ret = register_for_each_vma(uprobe, add ? uc : NULL); break; } } rcu_read_unlock_trace(); up_write(&uprobe->register_rwsem); return ret; } static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; mmap_read_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; if (!valid_vma(vma, false) || file_inode(vma->vm_file) != uprobe->inode) continue; offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; if (uprobe->offset < offset || uprobe->offset >= offset + vma->vm_end - vma->vm_start) continue; vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, vma, vaddr); } mmap_read_unlock(mm); return err; } static struct rb_node * find_node_in_range(struct inode *inode, loff_t min, loff_t max) { struct rb_node *n = uprobes_tree.rb_node; while (n) { struct uprobe *u = rb_entry(n, struct uprobe, rb_node); if (inode < u->inode) { n = n->rb_left; } else if (inode > u->inode) { n = n->rb_right; } else { if (max < u->offset) n = n->rb_left; else if (min > u->offset) n = n->rb_right; else break; } } return n; } /* * For a given range in vma, build a list of probes that need to be inserted. */ static void build_probe_list(struct inode *inode, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *head) { loff_t min, max; struct rb_node *n, *t; struct uprobe *u; INIT_LIST_HEAD(head); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset < min) break; /* if uprobe went away, it's safe to ignore it */ if (try_get_uprobe(u)) list_add(&u->pending_list, head); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; /* if uprobe went away, it's safe to ignore it */ if (try_get_uprobe(u)) list_add(&u->pending_list, head); } } read_unlock(&uprobes_treelock); } /* @vma contains reference counter, not the probed instruction. */ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) { struct list_head *pos, *q; struct delayed_uprobe *du; unsigned long vaddr; int ret = 0, err = 0; mutex_lock(&delayed_uprobe_lock); list_for_each_safe(pos, q, &delayed_uprobe_list) { du = list_entry(pos, struct delayed_uprobe, list); if (du->mm != vma->vm_mm || !valid_ref_ctr_vma(du->uprobe, vma)) continue; vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset); ret = __update_ref_ctr(vma->vm_mm, vaddr, 1); if (ret) { update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1); if (!err) err = ret; } delayed_uprobe_delete(du); } mutex_unlock(&delayed_uprobe_lock); return err; } /* * Called from mmap_region/vma_merge with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. */ int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; struct uprobe *uprobe, *u; struct inode *inode; if (no_uprobe_events()) return 0; if (vma->vm_file && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) delayed_ref_ctr_inc(vma); if (!valid_vma(vma, true)) return 0; inode = file_inode(vma->vm_file); if (!inode) return 0; mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); /* * We can race with uprobe_unregister(), this uprobe can be already * removed. But in this case filter_chain() must return false, all * consumers have gone away. */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma, vaddr); } put_uprobe(uprobe); } mutex_unlock(uprobes_mmap_hash(inode)); return 0; } static bool vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end) { loff_t min, max; struct inode *inode; struct rb_node *n; inode = file_inode(vma->vm_file); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); read_unlock(&uprobes_treelock); return !!n; } /* * Called in context of a munmap of a vma. */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (no_uprobe_events() || !valid_vma(vma, false)) return; if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) return; if (vma_has_uprobes(vma, start, end)) set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); } static vm_fault_t xol_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { struct xol_area *area = vma->vm_mm->uprobes_state.xol_area; vmf->page = area->page; get_page(vmf->page); return 0; } static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { return -EPERM; } static const struct vm_special_mapping xol_mapping = { .name = "[uprobes]", .fault = xol_fault, .mremap = xol_mremap, }; /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { struct vm_area_struct *vma; int ret; if (mmap_write_lock_killable(mm)) return -EINTR; if (mm->uprobes_state.xol_area) { ret = -EALREADY; goto fail; } if (!area->vaddr) { /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); if (IS_ERR_VALUE(area->vaddr)) { ret = area->vaddr; goto fail; } } vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO| VM_SEALED_SYSMAP, &xol_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto fail; } ret = 0; /* pairs with get_xol_area() */ smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: mmap_write_unlock(mm); return ret; } void * __weak arch_uprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; *psize = UPROBE_SWBP_INSN_SIZE; return &insn; } static struct xol_area *__create_xol_area(unsigned long vaddr) { struct mm_struct *mm = current->mm; unsigned long insns_size; struct xol_area *area; void *insns; area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long), GFP_KERNEL); if (!area->bitmap) goto free_area; area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); if (!area->page) goto free_bitmap; area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); insns = arch_uprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) return area; __free_page(area->page); free_bitmap: kfree(area->bitmap); free_area: kfree(area); out: return NULL; } /* * get_xol_area - Allocate process's xol_area if necessary. * This area will be used for storing instructions for execution out of line. * * Returns the allocated area or NULL. */ static struct xol_area *get_xol_area(void) { struct mm_struct *mm = current->mm; struct xol_area *area; if (!mm->uprobes_state.xol_area) __create_xol_area(0); /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */ return area; } /* * uprobe_clear_state - Free the area allocated for slots. */ void uprobe_clear_state(struct mm_struct *mm) { struct xol_area *area = mm->uprobes_state.xol_area; mutex_lock(&delayed_uprobe_lock); delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); if (!area) return; put_page(area->page); kfree(area->bitmap); kfree(area); } void uprobe_start_dup_mmap(void) { percpu_down_read(&dup_mmap_sem); } void uprobe_end_dup_mmap(void) { percpu_up_read(&dup_mmap_sem); } void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { set_bit(MMF_HAS_UPROBES, &newmm->flags); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ set_bit(MMF_RECALC_UPROBES, &newmm->flags); } } static unsigned long xol_get_slot_nr(struct xol_area *area) { unsigned long slot_nr; slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); if (slot_nr < UINSNS_PER_PAGE) { if (!test_and_set_bit(slot_nr, area->bitmap)) return slot_nr; } return UINSNS_PER_PAGE; } /* * xol_get_insn_slot - allocate a slot for xol. */ static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask) { struct xol_area *area = get_xol_area(); unsigned long slot_nr; if (!area) return false; wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE); utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES; arch_uprobe_copy_ixol(area->page, utask->xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return true; } /* * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot() */ static void xol_free_insn_slot(struct uprobe_task *utask) { struct xol_area *area = current->mm->uprobes_state.xol_area; unsigned long offset = utask->xol_vaddr - area->vaddr; unsigned int slot_nr; utask->xol_vaddr = 0; /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */ if (WARN_ON_ONCE(offset >= PAGE_SIZE)) return; slot_nr = offset / UPROBE_XOL_SLOT_BYTES; clear_bit(slot_nr, area->bitmap); smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ if (waitqueue_active(&area->wq)) wake_up(&area->wq); } void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len) { /* Initialize the slot */ copy_to_page(page, vaddr, src, len); /* * We probably need flush_icache_user_page() but it needs vma. * This should work on most of architectures by default. If * architecture needs to do something different it can define * its own version of the function. */ flush_dcache_page(page); } /** * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs * @regs: Reflects the saved state of the task after it has hit a breakpoint * instruction. * Return the address of the breakpoint instruction. */ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) { return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; } unsigned long uprobe_get_trap_addr(struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (unlikely(utask && utask->active_uprobe)) return utask->vaddr; return instruction_pointer(regs); } static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri) { ri->cons_cnt = 0; ri->next = utask->ri_pool; utask->ri_pool = ri; } static struct return_instance *ri_pool_pop(struct uprobe_task *utask) { struct return_instance *ri = utask->ri_pool; if (likely(ri)) utask->ri_pool = ri->next; return ri; } static void ri_free(struct return_instance *ri) { kfree(ri->extra_consumers); kfree_rcu(ri, rcu); } static void free_ret_instance(struct uprobe_task *utask, struct return_instance *ri, bool cleanup_hprobe) { unsigned seq; if (cleanup_hprobe) { enum hprobe_state hstate; (void)hprobe_consume(&ri->hprobe, &hstate); hprobe_finalize(&ri->hprobe, hstate); } /* * At this point return_instance is unlinked from utask's * return_instances list and this has become visible to ri_timer(). * If seqcount now indicates that ri_timer's return instance * processing loop isn't active, we can return ri into the pool of * to-be-reused return instances for future uretprobes. If ri_timer() * happens to be running right now, though, we fallback to safety and * just perform RCU-delated freeing of ri. * Admittedly, this is a rather simple use of seqcount, but it nicely * abstracts away all the necessary memory barriers, so we use * a well-supported kernel primitive here. */ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { /* immediate reuse of ri without RCU GP is OK */ ri_pool_push(utask, ri); } else { /* we might be racing with ri_timer(), so play it safe */ ri_free(ri); } } /* * Called with no locks held. * Called in context of an exiting or an exec-ing thread. */ void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; struct return_instance *ri, *ri_next; if (!utask) return; t->utask = NULL; WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr); timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; while (ri) { ri_next = ri->next; free_ret_instance(utask, ri, true /* cleanup_hprobe */); ri = ri_next; } /* free_ret_instance() above might add to ri_pool, so this loop should come last */ ri = utask->ri_pool; while (ri) { ri_next = ri->next; ri_free(ri); ri = ri_next; } kfree(utask); } #define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */ #define for_each_ret_instance_rcu(pos, head) \ for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next)) static void ri_timer(struct timer_list *timer) { struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer); struct return_instance *ri; /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */ guard(srcu)(&uretprobes_srcu); /* RCU protects return_instance from freeing. */ guard(rcu)(); /* * See free_ret_instance() for notes on seqcount use. * We also employ raw API variants to avoid lockdep false-positive * warning complaining about enabled preemption. The timer can only be * invoked once for a uprobe_task. Therefore there can only be one * writer. The reader does not require an even sequence count to make * progress, so it is OK to remain preemptible on PREEMPT_RT. */ raw_write_seqcount_begin(&utask->ri_seqcount); for_each_ret_instance_rcu(ri, utask->return_instances) hprobe_expire(&ri->hprobe, false); raw_write_seqcount_end(&utask->ri_seqcount); } static struct uprobe_task *alloc_utask(void) { struct uprobe_task *utask; utask = kzalloc(sizeof(*utask), GFP_KERNEL); if (!utask) return NULL; timer_setup(&utask->ri_timer, ri_timer, 0); seqcount_init(&utask->ri_seqcount); return utask; } /* * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. * * Returns: * - pointer to new uprobe_task on success * - NULL otherwise */ static struct uprobe_task *get_utask(void) { if (!current->utask) current->utask = alloc_utask(); return current->utask; } static struct return_instance *alloc_return_instance(struct uprobe_task *utask) { struct return_instance *ri; ri = ri_pool_pop(utask); if (ri) return ri; ri = kzalloc(sizeof(*ri), GFP_KERNEL); if (!ri) return ZERO_SIZE_PTR; return ri; } static struct return_instance *dup_return_instance(struct return_instance *old) { struct return_instance *ri; ri = kmemdup(old, sizeof(*ri), GFP_KERNEL); if (!ri) return NULL; if (unlikely(old->cons_cnt > 1)) { ri->extra_consumers = kmemdup(old->extra_consumers, sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1), GFP_KERNEL); if (!ri->extra_consumers) { kfree(ri); return NULL; } } return ri; } static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) { struct uprobe_task *n_utask; struct return_instance **p, *o, *n; struct uprobe *uprobe; n_utask = alloc_utask(); if (!n_utask) return -ENOMEM; t->utask = n_utask; /* protect uprobes from freeing, we'll need try_get_uprobe() them */ guard(srcu)(&uretprobes_srcu); p = &n_utask->return_instances; for (o = o_utask->return_instances; o; o = o->next) { n = dup_return_instance(o); if (!n) return -ENOMEM; /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */ uprobe = hprobe_expire(&o->hprobe, true); /* * New utask will have stable properly refcounted uprobe or * NULL. Even if we failed to get refcounted uprobe, we still * need to preserve full set of return_instances for proper * uretprobe handling and nesting in forked task. */ hprobe_init_stable(&n->hprobe, uprobe); n->next = NULL; rcu_assign_pointer(*p, n); p = &n->next; n_utask->depth++; } return 0; } static void dup_xol_work(struct callback_head *work) { if (current->flags & PF_EXITING) return; if (!__create_xol_area(current->utask->dup_xol_addr) && !fatal_signal_pending(current)) uprobe_warn(current, "dup xol area"); } /* * Called in context of a new clone/fork from copy_process. */ void uprobe_copy_process(struct task_struct *t, unsigned long flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; struct xol_area *area; t->utask = NULL; if (!utask || !utask->return_instances) return; if (mm == t->mm && !(flags & CLONE_VFORK)) return; if (dup_utask(t, utask)) return uprobe_warn(t, "dup ret instances"); /* The task can fork() after dup_xol_work() fails */ area = mm->uprobes_state.xol_area; if (!area) return uprobe_warn(t, "dup xol area"); if (mm == t->mm) return; t->utask->dup_xol_addr = area->vaddr; init_task_work(&t->utask->dup_xol_work, dup_xol_work); task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME); } /* * Current area->vaddr notion assume the trampoline address is always * equal area->vaddr. * * Returns -1 in case the xol_area is not allocated. */ unsigned long uprobe_get_trampoline_vaddr(void) { unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR; struct xol_area *area; /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ if (area) trampoline_vaddr = area->vaddr; return trampoline_vaddr; } static void cleanup_return_instances(struct uprobe_task *utask, bool chained, struct pt_regs *regs) { struct return_instance *ri = utask->return_instances, *ri_next; enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri_next = ri->next; rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; free_ret_instance(utask, ri, true /* cleanup_hprobe */); ri = ri_next; } } static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, struct return_instance *ri) { struct uprobe_task *utask = current->utask; unsigned long orig_ret_vaddr, trampoline_vaddr; bool chained; int srcu_idx; if (!get_xol_area()) goto free; if (utask->depth >= MAX_URETPROBE_DEPTH) { printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" " nestedness limit pid/tgid=%d/%d\n", current->pid, current->tgid); goto free; } trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) goto free; /* drop the entries invalidated by longjmp() */ chained = (orig_ret_vaddr == trampoline_vaddr); cleanup_return_instances(utask, chained, regs); /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier. */ if (chained) { if (!utask->return_instances) { /* * This situation is not possible. Likely we have an * attack from user-space. */ uprobe_warn(current, "handle tail call"); goto free; } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } /* __srcu_read_lock() because SRCU lock survives switch to user space */ srcu_idx = __srcu_read_lock(&uretprobes_srcu); ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; utask->depth++; hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx); ri->next = utask->return_instances; rcu_assign_pointer(utask->return_instances, ri); mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD); return; free: ri_free(ri); } /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { struct uprobe_task *utask = current->utask; int err; if (!try_get_uprobe(uprobe)) return -EINVAL; if (!xol_get_insn_slot(uprobe, utask)) { err = -ENOMEM; goto err_out; } utask->vaddr = bp_vaddr; err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { xol_free_insn_slot(utask); goto err_out; } utask->active_uprobe = uprobe; utask->state = UTASK_SSTEP; return 0; err_out: put_uprobe(uprobe); return err; } /* * If we are singlestepping, then ensure this thread is not connected to * non-fatal signals until completion of singlestep. When xol insn itself * triggers the signal, restart the original insn even if the task is * already SIGKILL'ed (since coredump should report the correct ip). This * is even more important if the task has a handler for SIGSEGV/etc, The * _same_ instruction should be repeated again after return from the signal * handler, and SSTEP can never finish in this case. */ bool uprobe_deny_signal(void) { struct task_struct *t = current; struct uprobe_task *utask = t->utask; if (likely(!utask || !utask->active_uprobe)) return false; WARN_ON_ONCE(utask->state != UTASK_SSTEP); if (task_sigpending(t)) { utask->signal_denied = true; clear_tsk_thread_flag(t, TIF_SIGPENDING); if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; set_tsk_thread_flag(t, TIF_UPROBE); } } return true; } static void mmf_recalc_uprobes(struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* * This is not strictly accurate, we can race with * uprobe_unregister() and see the already removed * uprobe if delete_uprobe() was not yet called. * Or this uprobe can be filtered out. */ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) return; } clear_bit(MMF_HAS_UPROBES, &mm->flags); } static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) { struct page *page; uprobe_opcode_t opcode; int result; if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE))) return -EINVAL; pagefault_disable(); result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) goto out; result = get_user_pages(vaddr, 1, FOLL_FORCE, &page); if (result < 0) return result; copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ return is_trap_insn(&opcode); } static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; struct file *vm_file; loff_t offset; unsigned int seq; guard(rcu)(); if (!mmap_lock_speculate_try_begin(mm, &seq)) return NULL; vma = vma_lookup(mm, bp_vaddr); if (!vma) return NULL; /* * vm_file memory can be reused for another instance of struct file, * but can't be freed from under us, so it's safe to read fields from * it, even if the values are some garbage values; ultimately * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure * that whatever we speculatively found is correct */ vm_file = READ_ONCE(vma->vm_file); if (!vm_file) return NULL; offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start); uprobe = find_uprobe_rcu(vm_file->f_inode, offset); if (!uprobe) return NULL; /* now double check that nothing about MM changed */ if (mmap_lock_speculate_retry(mm, seq)) return NULL; return uprobe; } /* assumes being inside RCU protected region */ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; uprobe = find_active_uprobe_speculative(bp_vaddr); if (uprobe) return uprobe; mmap_read_lock(mm); vma = vma_lookup(mm, bp_vaddr); if (vma) { if (vma->vm_file) { struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); uprobe = find_uprobe_rcu(inode, offset); } if (!uprobe) *is_swbp = is_trap_at_addr(mm, bp_vaddr); } else { *is_swbp = -EFAULT; } if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) mmf_recalc_uprobes(mm); mmap_read_unlock(mm); return uprobe; } static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie) { struct return_consumer *ric; if (unlikely(ri == ZERO_SIZE_PTR)) return ri; if (unlikely(ri->cons_cnt > 0)) { ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL); if (!ric) { ri_free(ri); return ZERO_SIZE_PTR; } ri->extra_consumers = ric; } ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1]; ric->id = id; ric->cookie = cookie; ri->cons_cnt++; return ri; } static struct return_consumer * return_consumer_find(struct return_instance *ri, int *iter, int id) { struct return_consumer *ric; int idx; for (idx = *iter; idx < ri->cons_cnt; idx++) { ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1]; if (ric->id == id) { *iter = idx + 1; return ric; } } return NULL; } static bool ignore_ret_handler(int rc) { return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE; } static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; bool has_consumers = false, remove = true; struct return_instance *ri = NULL; struct uprobe_task *utask = current->utask; utask->auprobe = &uprobe->arch; list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; __u64 cookie = 0; int rc = 0; if (uc->handler) { rc = uc->handler(uc, regs, &cookie); WARN(rc < 0 || rc > 2, "bad rc=0x%x from %ps()\n", rc, uc->handler); } remove &= rc == UPROBE_HANDLER_REMOVE; has_consumers = true; if (!uc->ret_handler || ignore_ret_handler(rc)) continue; if (!ri) ri = alloc_return_instance(utask); if (session) ri = push_consumer(ri, uc->id, cookie); } utask->auprobe = NULL; if (!ZERO_OR_NULL_PTR(ri)) prepare_uretprobe(uprobe, regs, ri); if (remove && has_consumers) { down_read(&uprobe->register_rwsem); /* re-check that removal is still required, this time under lock */ if (!filter_chain(uprobe, current->mm)) { WARN_ON(!uprobe_is_active(uprobe)); unapply_uprobe(uprobe, current->mm); } up_read(&uprobe->register_rwsem); } } static void handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs) { struct return_consumer *ric; struct uprobe_consumer *uc; int ric_idx = 0; /* all consumers unsubscribed meanwhile */ if (unlikely(!uprobe)) return; rcu_read_lock_trace(); list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; if (uc->ret_handler) { ric = return_consumer_find(ri, &ric_idx, uc->id); if (!session || ric) uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL); } } rcu_read_unlock_trace(); } static struct return_instance *find_next_ret_chain(struct return_instance *ri) { bool chained; do { chained = ri->chained; ri = ri->next; /* can't be NULL if chained */ } while (chained); return ri; } void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri, *ri_next, *next_chain; struct uprobe *uprobe; enum hprobe_state hstate; bool valid; utask = current->utask; if (!utask) goto sigill; ri = utask->return_instances; if (!ri) goto sigill; do { /* * We should throw out the frames invalidated by longjmp(). * If this chain is valid, then the next one should be alive * or NULL; the latter case means that nobody but ri->func * could hit this trampoline on return. TODO: sigaltstack(). */ next_chain = find_next_ret_chain(ri); valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { /* pop current instance from the stack of pending return instances, * as it's not pending anymore: we just fixed up original * instruction pointer in regs and are about to call handlers; * this allows fixup_uretprobe_trampoline_entries() to properly fix up * captured stack traces from uretprobe handlers, in which pending * trampoline addresses on the stack are replaced with correct * original return addresses */ ri_next = ri->next; rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; uprobe = hprobe_consume(&ri->hprobe, &hstate); if (valid) handle_uretprobe_chain(ri, uprobe, regs); hprobe_finalize(&ri->hprobe, hstate); /* We already took care of hprobe, no need to waste more time on that. */ free_ret_instance(utask, ri, false /* !cleanup_hprobe */); ri = ri_next; } while (ri != next_chain); } while (!valid); return; sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); force_sig(SIGILL); } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) { return false; } bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs) { return true; } /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. */ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == uprobe_get_trampoline_vaddr()) return uprobe_handle_trampoline(regs); rcu_read_lock_trace(); uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ force_sig(SIGTRAP); } else { /* * Either we raced with uprobe_unregister() or we can't * access this memory. The latter is only possible if * another thread plays with our ->mm. In both cases * we can simply restart. If this vma was unmapped we * can pretend this insn was not executed yet and get * the (correct) SIGSEGV after restart. */ instruction_pointer_set(regs, bp_vaddr); } goto out; } /* change it in advance for ->handler() and restart */ instruction_pointer_set(regs, bp_vaddr); /* * TODO: move copy_insn/etc into _register and remove this hack. * After we hit the bp, _unregister + _register can install the * new and not-yet-analyzed uprobe at the same address, restart. */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; /* * Pairs with the smp_wmb() in prepare_uprobe(). * * Guarantees that if we see the UPROBE_COPY_INSN bit set, then * we must also see the stores to &uprobe->arch performed by the * prepare_uprobe() call. */ smp_rmb(); /* Tracing handlers use ->utask to communicate with fetch methods */ if (!get_utask()) goto out; if (arch_uprobe_ignore(&uprobe->arch, regs)) goto out; handler_chain(uprobe, regs); if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; if (pre_ssout(uprobe, regs, bp_vaddr)) goto out; out: /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ rcu_read_unlock_trace(); } /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. */ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) { struct uprobe *uprobe; int err = 0; uprobe = utask->active_uprobe; if (utask->state == UTASK_SSTEP_ACK) err = arch_uprobe_post_xol(&uprobe->arch, regs); else if (utask->state == UTASK_SSTEP_TRAPPED) arch_uprobe_abort_xol(&uprobe->arch, regs); else WARN_ON_ONCE(1); put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; xol_free_insn_slot(utask); if (utask->signal_denied) { set_thread_flag(TIF_SIGPENDING); utask->signal_denied = false; } if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); force_sig(SIGILL); } } /* * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and * allows the thread to return from interrupt. After that handle_swbp() * sets utask->active_uprobe. * * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag * and allows the thread to return from interrupt. * * While returning to userspace, thread notices the TIF_UPROBE flag and calls * uprobe_notify_resume(). */ void uprobe_notify_resume(struct pt_regs *regs) { struct uprobe_task *utask; clear_thread_flag(TIF_UPROBE); utask = current->utask; if (utask && utask->active_uprobe) handle_singlestep(utask, regs); else handle_swbp(regs); } /* * uprobe_pre_sstep_notifier gets called from interrupt context as part of * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit. */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { if (!current->mm) return 0; if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) && (!current->utask || !current->utask->return_instances)) return 0; set_thread_flag(TIF_UPROBE); return 1; } /* * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep. */ int uprobe_post_sstep_notifier(struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (!current->mm || !utask || !utask->active_uprobe) /* task is currently not uprobed */ return 0; utask->state = UTASK_SSTEP_ACK; set_thread_flag(TIF_UPROBE); return 1; } static struct notifier_block uprobe_exception_nb = { .notifier_call = arch_uprobe_exception_notify, .priority = INT_MAX-1, /* notified after kprobes, kgdb */ }; void __init uprobes_init(void) { int i; for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); BUG_ON(register_die_notifier(&uprobe_exception_nb)); }
3 3 2 3 17 17 8 7 6 6 1 5 2 4 3 3 3 3 3 2 2 2 2 4 17 17 6 6 5 5 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 /* BNEP implementation for Linux Bluetooth stack (BlueZ). Copyright (C) 2001-2002 Inventel Systemes Written 2001-2002 by David Libault <david.libault@inventel.fr> Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #include <linux/compat.h> #include <linux/export.h> #include <linux/file.h> #include "bnep.h" static struct bt_sock_list bnep_sk_list = { .lock = __RW_LOCK_UNLOCKED(bnep_sk_list.lock) }; static int bnep_sock_release(struct socket *sock) { struct sock *sk = sock->sk; BT_DBG("sock %p sk %p", sock, sk); if (!sk) return 0; bt_sock_unlink(&bnep_sk_list, sk); sock_orphan(sk); sock_put(sk); return 0; } static int do_bnep_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp) { struct bnep_connlist_req cl; struct bnep_connadd_req ca; struct bnep_conndel_req cd; struct bnep_conninfo ci; struct socket *nsock; __u32 supp_feat = BIT(BNEP_SETUP_RESPONSE); int err; BT_DBG("cmd %x arg %p", cmd, argp); switch (cmd) { case BNEPCONNADD: if (!capable(CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&ca, argp, sizeof(ca))) return -EFAULT; nsock = sockfd_lookup(ca.sock, &err); if (!nsock) return err; if (nsock->sk->sk_state != BT_CONNECTED) { sockfd_put(nsock); return -EBADFD; } ca.device[sizeof(ca.device)-1] = 0; err = bnep_add_connection(&ca, nsock); if (!err) { if (copy_to_user(argp, &ca, sizeof(ca))) err = -EFAULT; } else sockfd_put(nsock); return err; case BNEPCONNDEL: if (!capable(CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&cd, argp, sizeof(cd))) return -EFAULT; return bnep_del_connection(&cd); case BNEPGETCONNLIST: if (copy_from_user(&cl, argp, sizeof(cl))) return -EFAULT; if (cl.cnum <= 0) return -EINVAL; err = bnep_get_connlist(&cl); if (!err && copy_to_user(argp, &cl, sizeof(cl))) return -EFAULT; return err; case BNEPGETCONNINFO: if (copy_from_user(&ci, argp, sizeof(ci))) return -EFAULT; err = bnep_get_conninfo(&ci); if (!err && copy_to_user(argp, &ci, sizeof(ci))) return -EFAULT; return err; case BNEPGETSUPPFEAT: if (copy_to_user(argp, &supp_feat, sizeof(supp_feat))) return -EFAULT; return 0; default: return -EINVAL; } return 0; } static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return do_bnep_sock_ioctl(sock, cmd, (void __user *)arg); } #ifdef CONFIG_COMPAT static int bnep_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); if (cmd == BNEPGETCONNLIST) { struct bnep_connlist_req cl; unsigned __user *p = argp; u32 uci; int err; if (get_user(cl.cnum, p) || get_user(uci, p + 1)) return -EFAULT; cl.ci = compat_ptr(uci); if (cl.cnum <= 0) return -EINVAL; err = bnep_get_connlist(&cl); if (!err && put_user(cl.cnum, p)) err = -EFAULT; return err; } return do_bnep_sock_ioctl(sock, cmd, argp); } #endif static const struct proto_ops bnep_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = bnep_sock_release, .ioctl = bnep_sock_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = bnep_sock_compat_ioctl, #endif .bind = sock_no_bind, .getname = sock_no_getname, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static struct proto bnep_proto = { .name = "BNEP", .owner = THIS_MODULE, .obj_size = sizeof(struct bt_sock) }; static int bnep_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; sk = bt_sock_alloc(net, sock, &bnep_proto, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; sock->ops = &bnep_sock_ops; sock->state = SS_UNCONNECTED; bt_sock_link(&bnep_sk_list, sk); return 0; } static const struct net_proto_family bnep_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = bnep_sock_create }; int __init bnep_sock_init(void) { int err; err = proto_register(&bnep_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_BNEP, &bnep_sock_family_ops); if (err < 0) { BT_ERR("Can't register BNEP socket"); goto error; } err = bt_procfs_init(&init_net, "bnep", &bnep_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create BNEP proc file"); bt_sock_unregister(BTPROTO_BNEP); goto error; } BT_INFO("BNEP socket layer initialized"); return 0; error: proto_unregister(&bnep_proto); return err; } void __exit bnep_sock_cleanup(void) { bt_procfs_cleanup(&init_net, "bnep"); bt_sock_unregister(BTPROTO_BNEP); proto_unregister(&bnep_proto); }
21852 25 18839 709 3388 26 2243 186 1181 4294 13026 3606 10600 17476 16057 217 374 3727 721 3734 1 170 105 12029 22 85 1035 2 416 24 4393 10319 5 6337 10000 45 825 3162 905 19 9144 50 5059 260 1 2 5 1 37 489 526 527 9874 2 7 15 11 209 33 5566 154 137 2 10433 13 8 152 15 95 177 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-instrumented.sh // DO NOT MODIFY THIS FILE DIRECTLY /* * This file provoides atomic operations with explicit instrumentation (e.g. * KASAN, KCSAN), which should be used unless it is necessary to avoid * instrumentation. Where it is necessary to aovid instrumenation, the * raw_atomic*() operations should be used. */ #ifndef _LINUX_ATOMIC_INSTRUMENTED_H #define _LINUX_ATOMIC_INSTRUMENTED_H #include <linux/build_bug.h> #include <linux/compiler.h> #include <linux/instrumented.h> /** * atomic_read() - atomic load with relaxed ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_read() there. * * Return: The value loaded from @v. */ static __always_inline int atomic_read(const atomic_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_read(v); } /** * atomic_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_read_acquire() there. * * Return: The value loaded from @v. */ static __always_inline int atomic_read_acquire(const atomic_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_read_acquire(v); } /** * atomic_set() - atomic set with relaxed ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_set() there. * * Return: Nothing. */ static __always_inline void atomic_set(atomic_t *v, int i) { instrument_atomic_write(v, sizeof(*v)); raw_atomic_set(v, i); } /** * atomic_set_release() - atomic set with release ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_set_release() there. * * Return: Nothing. */ static __always_inline void atomic_set_release(atomic_t *v, int i) { kcsan_release(); instrument_atomic_write(v, sizeof(*v)); raw_atomic_set_release(v, i); } /** * atomic_add() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_add() there. * * Return: Nothing. */ static __always_inline void atomic_add(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_add(i, v); } /** * atomic_add_return() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_add_return(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return(i, v); } /** * atomic_add_return_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_add_return_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return_acquire(i, v); } /** * atomic_add_return_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_add_return_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return_release(i, v); } /** * atomic_add_return_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_add_return_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return_relaxed(i, v); } /** * atomic_fetch_add() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add(i, v); } /** * atomic_fetch_add_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_acquire(i, v); } /** * atomic_fetch_add_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_release(i, v); } /** * atomic_fetch_add_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_relaxed(i, v); } /** * atomic_sub() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub() there. * * Return: Nothing. */ static __always_inline void atomic_sub(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_sub(i, v); } /** * atomic_sub_return() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return(i, v); } /** * atomic_sub_return_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return_acquire(i, v); } /** * atomic_sub_return_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return_release(i, v); } /** * atomic_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return_relaxed(i, v); } /** * atomic_fetch_sub() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub(i, v); } /** * atomic_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub_acquire(i, v); } /** * atomic_fetch_sub_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub_release(i, v); } /** * atomic_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub_relaxed(i, v); } /** * atomic_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc() there. * * Return: Nothing. */ static __always_inline void atomic_inc(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_inc(v); } /** * atomic_inc_return() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return(v); } /** * atomic_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return_acquire(v); } /** * atomic_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return_release(v); } /** * atomic_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return_relaxed(v); } /** * atomic_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_inc(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc(v); } /** * atomic_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_inc_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc_acquire(v); } /** * atomic_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_inc_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc_release(v); } /** * atomic_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_inc_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc_relaxed(v); } /** * atomic_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec() there. * * Return: Nothing. */ static __always_inline void atomic_dec(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_dec(v); } /** * atomic_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return(v); } /** * atomic_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return_acquire(v); } /** * atomic_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return_release(v); } /** * atomic_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return_relaxed(v); } /** * atomic_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_dec(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec(v); } /** * atomic_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_dec_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec_acquire(v); } /** * atomic_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_dec_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec_release(v); } /** * atomic_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_dec_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec_relaxed(v); } /** * atomic_and() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_and() there. * * Return: Nothing. */ static __always_inline void atomic_and(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_and(i, v); } /** * atomic_fetch_and() - atomic bitwise AND with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and(i, v); } /** * atomic_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and_acquire(i, v); } /** * atomic_fetch_and_release() - atomic bitwise AND with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and_release(i, v); } /** * atomic_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and_relaxed(i, v); } /** * atomic_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_andnot() there. * * Return: Nothing. */ static __always_inline void atomic_andnot(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_andnot(i, v); } /** * atomic_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_andnot(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot(i, v); } /** * atomic_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_andnot_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot_acquire(i, v); } /** * atomic_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_andnot_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot_release(i, v); } /** * atomic_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_andnot_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot_relaxed(i, v); } /** * atomic_or() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_or() there. * * Return: Nothing. */ static __always_inline void atomic_or(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_or(i, v); } /** * atomic_fetch_or() - atomic bitwise OR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or(i, v); } /** * atomic_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or_acquire(i, v); } /** * atomic_fetch_or_release() - atomic bitwise OR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or_release(i, v); } /** * atomic_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or_relaxed(i, v); } /** * atomic_xor() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_xor() there. * * Return: Nothing. */ static __always_inline void atomic_xor(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_xor(i, v); } /** * atomic_fetch_xor() - atomic bitwise XOR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_xor() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_xor(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_xor(i, v); } /** * atomic_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_xor_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_xor_acquire(i, v); } /** * atomic_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_xor_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_xor_release(i, v); } /** * atomic_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_xor_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_xor_relaxed(i, v); } /** * atomic_xchg() - atomic exchange with full ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_xchg() there. * * Return: The original value of @v. */ static __always_inline int atomic_xchg(atomic_t *v, int new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_xchg(v, new); } /** * atomic_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_xchg_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_xchg_acquire(atomic_t *v, int new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_xchg_acquire(v, new); } /** * atomic_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_xchg_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_xchg_release(atomic_t *v, int new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_xchg_release(v, new); } /** * atomic_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_xchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_xchg_relaxed(atomic_t *v, int new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_xchg_relaxed(v, new); } /** * atomic_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg() there. * * Return: The original value of @v. */ static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_cmpxchg(v, old, new); } /** * atomic_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_cmpxchg_acquire(atomic_t *v, int old, int new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_cmpxchg_acquire(v, old, new); } /** * atomic_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_cmpxchg_release(atomic_t *v, int old, int new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_cmpxchg_release(v, old, new); } /** * atomic_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_cmpxchg_relaxed(v, old, new); } /** * atomic_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg(v, old, new); } /** * atomic_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_acquire() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_acquire(v, old, new); } /** * atomic_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_release() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_release(v, old, new); } /** * atomic_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_relaxed() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_relaxed(v, old, new); } /** * atomic_sub_and_test() - atomic subtract and test if zero with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_and_test(i, v); } /** * atomic_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_dec_and_test(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_and_test(v); } /** * atomic_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_inc_and_test(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_and_test(v); } /** * atomic_add_negative() - atomic add and test if negative with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_negative() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_add_negative(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_negative(i, v); } /** * atomic_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_negative_acquire() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_add_negative_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_negative_acquire(i, v); } /** * atomic_add_negative_release() - atomic add and test if negative with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_negative_release() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_add_negative_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_negative_release(i, v); } /** * atomic_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_negative_relaxed() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_add_negative_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_negative_relaxed(i, v); } /** * atomic_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_unless() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_unless(v, a, u); } /** * atomic_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_add_unless() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_add_unless(atomic_t *v, int a, int u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_unless(v, a, u); } /** * atomic_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_inc_not_zero() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_inc_not_zero(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_not_zero(v); } /** * atomic_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_inc_unless_negative() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_inc_unless_negative(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_unless_negative(v); } /** * atomic_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_dec_unless_positive() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_dec_unless_positive(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_unless_positive(v); } /** * atomic_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_dec_if_positive() there. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline int atomic_dec_if_positive(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_if_positive(v); } /** * atomic64_read() - atomic load with relaxed ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_read() there. * * Return: The value loaded from @v. */ static __always_inline s64 atomic64_read(const atomic64_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic64_read(v); } /** * atomic64_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_read_acquire() there. * * Return: The value loaded from @v. */ static __always_inline s64 atomic64_read_acquire(const atomic64_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic64_read_acquire(v); } /** * atomic64_set() - atomic set with relaxed ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_set() there. * * Return: Nothing. */ static __always_inline void atomic64_set(atomic64_t *v, s64 i) { instrument_atomic_write(v, sizeof(*v)); raw_atomic64_set(v, i); } /** * atomic64_set_release() - atomic set with release ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_set_release() there. * * Return: Nothing. */ static __always_inline void atomic64_set_release(atomic64_t *v, s64 i) { kcsan_release(); instrument_atomic_write(v, sizeof(*v)); raw_atomic64_set_release(v, i); } /** * atomic64_add() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add() there. * * Return: Nothing. */ static __always_inline void atomic64_add(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_add(i, v); } /** * atomic64_add_return() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_return() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_add_return(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_return(i, v); } /** * atomic64_add_return_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_add_return_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_return_acquire(i, v); } /** * atomic64_add_return_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_return_release() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_add_return_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_return_release(i, v); } /** * atomic64_add_return_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_add_return_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_return_relaxed(i, v); } /** * atomic64_fetch_add() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_add(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add(i, v); } /** * atomic64_fetch_add_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_add_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add_acquire(i, v); } /** * atomic64_fetch_add_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_add_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add_release(i, v); } /** * atomic64_fetch_add_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add_relaxed(i, v); } /** * atomic64_sub() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub() there. * * Return: Nothing. */ static __always_inline void atomic64_sub(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_sub(i, v); } /** * atomic64_sub_return() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_return() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_sub_return(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_return(i, v); } /** * atomic64_sub_return_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_sub_return_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_return_acquire(i, v); } /** * atomic64_sub_return_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_return_release() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_return_release(i, v); } /** * atomic64_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_sub_return_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_return_relaxed(i, v); } /** * atomic64_fetch_sub() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_sub(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_sub(i, v); } /** * atomic64_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_sub_acquire(i, v); } /** * atomic64_fetch_sub_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_sub_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_sub_release(i, v); } /** * atomic64_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_sub_relaxed(i, v); } /** * atomic64_inc() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc() there. * * Return: Nothing. */ static __always_inline void atomic64_inc(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_inc(v); } /** * atomic64_inc_return() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_return() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_inc_return(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_return(v); } /** * atomic64_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_inc_return_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_return_acquire(v); } /** * atomic64_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_return_release() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_inc_return_release(atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_return_release(v); } /** * atomic64_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_inc_return_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_return_relaxed(v); } /** * atomic64_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_inc(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_inc(v); } /** * atomic64_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_inc_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_inc_acquire(v); } /** * atomic64_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_inc_release(atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_inc_release(v); } /** * atomic64_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_inc_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_inc_relaxed(v); } /** * atomic64_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec() there. * * Return: Nothing. */ static __always_inline void atomic64_dec(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_dec(v); } /** * atomic64_dec_return() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_return() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_dec_return(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_return(v); } /** * atomic64_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_dec_return_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_return_acquire(v); } /** * atomic64_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_return_release() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_dec_return_release(atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_return_release(v); } /** * atomic64_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_dec_return_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_return_relaxed(v); } /** * atomic64_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_dec(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec(v); } /** * atomic64_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_dec_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec_acquire(v); } /** * atomic64_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_dec_release(atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec_release(v); } /** * atomic64_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_dec_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec_relaxed(v); } /** * atomic64_and() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_and() there. * * Return: Nothing. */ static __always_inline void atomic64_and(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_and(i, v); } /** * atomic64_fetch_and() - atomic bitwise AND with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and(i, v); } /** * atomic64_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and_acquire(i, v); } /** * atomic64_fetch_and_release() - atomic bitwise AND with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and_release(i, v); } /** * atomic64_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and_relaxed(i, v); } /** * atomic64_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_andnot() there. * * Return: Nothing. */ static __always_inline void atomic64_andnot(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_andnot(i, v); } /** * atomic64_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot(i, v); } /** * atomic64_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot_acquire(i, v); } /** * atomic64_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot_release(i, v); } /** * atomic64_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot_relaxed(i, v); } /** * atomic64_or() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_or() there. * * Return: Nothing. */ static __always_inline void atomic64_or(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_or(i, v); } /** * atomic64_fetch_or() - atomic bitwise OR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_or(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or(i, v); } /** * atomic64_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or_acquire(i, v); } /** * atomic64_fetch_or_release() - atomic bitwise OR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_or_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or_release(i, v); } /** * atomic64_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or_relaxed(i, v); } /** * atomic64_xor() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xor() there. * * Return: Nothing. */ static __always_inline void atomic64_xor(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_xor(i, v); } /** * atomic64_fetch_xor() - atomic bitwise XOR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor(i, v); } /** * atomic64_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor_acquire(i, v); } /** * atomic64_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor_release(i, v); } /** * atomic64_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor_relaxed(i, v); } /** * atomic64_xchg() - atomic exchange with full ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg(atomic64_t *v, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg(v, new); } /** * atomic64_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg_acquire(atomic64_t *v, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg_acquire(v, new); } /** * atomic64_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg_release(atomic64_t *v, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg_release(v, new); } /** * atomic64_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg_relaxed(atomic64_t *v, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg_relaxed(v, new); } /** * atomic64_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg(v, old, new); } /** * atomic64_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg_acquire(v, old, new); } /** * atomic64_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg_release(v, old, new); } /** * atomic64_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg_relaxed(v, old, new); } /** * atomic64_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg(v, old, new); } /** * atomic64_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_acquire() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_acquire(v, old, new); } /** * atomic64_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_release() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_release(v, old, new); } /** * atomic64_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_relaxed() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_relaxed(v, old, new); } /** * atomic64_sub_and_test() - atomic subtract and test if zero with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic64_sub_and_test(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_and_test(i, v); } /** * atomic64_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic64_dec_and_test(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_and_test(v); } /** * atomic64_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic64_inc_and_test(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_and_test(v); } /** * atomic64_add_negative() - atomic add and test if negative with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative(i, v); } /** * atomic64_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative_acquire() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative_acquire(i, v); } /** * atomic64_add_negative_release() - atomic add and test if negative with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative_release() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative_release(i, v); } /** * atomic64_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative_relaxed() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative_relaxed(i, v); } /** * atomic64_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_unless() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add_unless(v, a, u); } /** * atomic64_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_add_unless() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_unless(v, a, u); } /** * atomic64_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic64_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_inc_not_zero() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_inc_not_zero(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_not_zero(v); } /** * atomic64_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic64_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_inc_unless_negative() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_inc_unless_negative(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_unless_negative(v); } /** * atomic64_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic64_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_dec_unless_positive() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_dec_unless_positive(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_unless_positive(v); } /** * atomic64_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic64_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_dec_if_positive() there. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_if_positive(v); } /** * atomic_long_read() - atomic load with relaxed ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_read() there. * * Return: The value loaded from @v. */ static __always_inline long atomic_long_read(const atomic_long_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_long_read(v); } /** * atomic_long_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_read_acquire() there. * * Return: The value loaded from @v. */ static __always_inline long atomic_long_read_acquire(const atomic_long_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_long_read_acquire(v); } /** * atomic_long_set() - atomic set with relaxed ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_set() there. * * Return: Nothing. */ static __always_inline void atomic_long_set(atomic_long_t *v, long i) { instrument_atomic_write(v, sizeof(*v)); raw_atomic_long_set(v, i); } /** * atomic_long_set_release() - atomic set with release ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_set_release() there. * * Return: Nothing. */ static __always_inline void atomic_long_set_release(atomic_long_t *v, long i) { kcsan_release(); instrument_atomic_write(v, sizeof(*v)); raw_atomic_long_set_release(v, i); } /** * atomic_long_add() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add() there. * * Return: Nothing. */ static __always_inline void atomic_long_add(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_add(i, v); } /** * atomic_long_add_return() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return(i, v); } /** * atomic_long_add_return_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return_acquire(i, v); } /** * atomic_long_add_return_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return_release(i, v); } /** * atomic_long_add_return_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return_relaxed(i, v); } /** * atomic_long_fetch_add() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add(i, v); } /** * atomic_long_fetch_add_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_acquire(i, v); } /** * atomic_long_fetch_add_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_release(i, v); } /** * atomic_long_fetch_add_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_relaxed(i, v); } /** * atomic_long_sub() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub() there. * * Return: Nothing. */ static __always_inline void atomic_long_sub(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_sub(i, v); } /** * atomic_long_sub_return() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_sub_return(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return(i, v); } /** * atomic_long_sub_return_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_sub_return_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return_acquire(i, v); } /** * atomic_long_sub_return_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_sub_return_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return_release(i, v); } /** * atomic_long_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_sub_return_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return_relaxed(i, v); } /** * atomic_long_fetch_sub() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub(i, v); } /** * atomic_long_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub_acquire(i, v); } /** * atomic_long_fetch_sub_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub_release(i, v); } /** * atomic_long_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub_relaxed(i, v); } /** * atomic_long_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc() there. * * Return: Nothing. */ static __always_inline void atomic_long_inc(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_inc(v); } /** * atomic_long_inc_return() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return(v); } /** * atomic_long_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return_acquire(v); } /** * atomic_long_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return_release(v); } /** * atomic_long_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return_relaxed(v); } /** * atomic_long_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc(v); } /** * atomic_long_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc_acquire(v); } /** * atomic_long_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc_release(v); } /** * atomic_long_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc_relaxed(v); } /** * atomic_long_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec() there. * * Return: Nothing. */ static __always_inline void atomic_long_dec(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_dec(v); } /** * atomic_long_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return(v); } /** * atomic_long_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return_acquire(v); } /** * atomic_long_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return_release(v); } /** * atomic_long_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return_relaxed(v); } /** * atomic_long_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_dec(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec(v); } /** * atomic_long_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_dec_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec_acquire(v); } /** * atomic_long_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_dec_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec_release(v); } /** * atomic_long_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_dec_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec_relaxed(v); } /** * atomic_long_and() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_and() there. * * Return: Nothing. */ static __always_inline void atomic_long_and(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_and(i, v); } /** * atomic_long_fetch_and() - atomic bitwise AND with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_and(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and(i, v); } /** * atomic_long_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_and_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and_acquire(i, v); } /** * atomic_long_fetch_and_release() - atomic bitwise AND with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_and_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and_release(i, v); } /** * atomic_long_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_and_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and_relaxed(i, v); } /** * atomic_long_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_andnot() there. * * Return: Nothing. */ static __always_inline void atomic_long_andnot(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_andnot(i, v); } /** * atomic_long_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot(i, v); } /** * atomic_long_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot_acquire(i, v); } /** * atomic_long_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot_release(i, v); } /** * atomic_long_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot_relaxed(i, v); } /** * atomic_long_or() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_or() there. * * Return: Nothing. */ static __always_inline void atomic_long_or(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_or(i, v); } /** * atomic_long_fetch_or() - atomic bitwise OR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_or(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or(i, v); } /** * atomic_long_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_or_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or_acquire(i, v); } /** * atomic_long_fetch_or_release() - atomic bitwise OR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_or_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or_release(i, v); } /** * atomic_long_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_or_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or_relaxed(i, v); } /** * atomic_long_xor() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xor() there. * * Return: Nothing. */ static __always_inline void atomic_long_xor(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_xor(i, v); } /** * atomic_long_fetch_xor() - atomic bitwise XOR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_xor(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor(i, v); } /** * atomic_long_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_xor_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor_acquire(i, v); } /** * atomic_long_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_xor_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor_release(i, v); } /** * atomic_long_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor_relaxed(i, v); } /** * atomic_long_xchg() - atomic exchange with full ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg(atomic_long_t *v, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg(v, new); } /** * atomic_long_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg_acquire(atomic_long_t *v, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg_acquire(v, new); } /** * atomic_long_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg_release(atomic_long_t *v, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg_release(v, new); } /** * atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg_relaxed(atomic_long_t *v, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg_relaxed(v, new); } /** * atomic_long_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_cmpxchg(atomic_long_t *v, long old, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg(v, old, new); } /** * atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg_acquire(v, old, new); } /** * atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg_release(v, old, new); } /** * atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg_relaxed(v, old, new); } /** * atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg(v, old, new); } /** * atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_acquire() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_acquire(v, old, new); } /** * atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_release() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_release(v, old, new); } /** * atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_relaxed() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_relaxed(v, old, new); } /** * atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_long_sub_and_test(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_and_test(i, v); } /** * atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_long_dec_and_test(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_and_test(v); } /** * atomic_long_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_long_inc_and_test(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_and_test(v); } /** * atomic_long_add_negative() - atomic add and test if negative with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_long_add_negative(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative(i, v); } /** * atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_acquire() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_long_add_negative_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative_acquire(i, v); } /** * atomic_long_add_negative_release() - atomic add and test if negative with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_release() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_long_add_negative_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative_release(i, v); } /** * atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_relaxed() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_long_add_negative_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative_relaxed(i, v); } /** * atomic_long_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_unless() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_unless(v, a, u); } /** * atomic_long_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_add_unless() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_long_add_unless(atomic_long_t *v, long a, long u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_unless(v, a, u); } /** * atomic_long_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_long_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_not_zero() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_long_inc_not_zero(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_not_zero(v); } /** * atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_long_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_unless_negative() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_long_inc_unless_negative(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_unless_negative(v); } /** * atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_long_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_unless_positive() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_long_dec_unless_positive(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_unless_positive(v); } /** * atomic_long_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_long_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_if_positive() there. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline long atomic_long_dec_if_positive(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_if_positive(v); } #define xchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg(__ai_ptr, __VA_ARGS__); \ }) #define xchg_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg_acquire(__ai_ptr, __VA_ARGS__); \ }) #define xchg_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg_release(__ai_ptr, __VA_ARGS__); \ }) #define xchg_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_acquire(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_release(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_acquire(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_release(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_acquire(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_release(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define try_cmpxchg(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_acquire(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_release(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_relaxed(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_acquire(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_release(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_relaxed(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_acquire(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_release(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_relaxed(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define cmpxchg_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_local(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_local(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_local(__ai_ptr, __VA_ARGS__); \ }) #define sync_cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) #define try_cmpxchg_local(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_local(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_local(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define sync_try_cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_sync_try_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ // 8829b337928e9508259079d32581775ececd415b
17 5 17 6 17 3 17 6 17 10 17 2 17 2 34 2 34 3 34 5 34 4 34 2 34 2 33 2 35 33 36 1 1 21 1 17 27 14 27 27 19 27 8 27 24 5 22 4 22 3 22 19 22 18 29 27 27 27 11 16 27 22 22 27 27 6 4 4 6 16 13 12 16 3 2 1 3 22 22 16 4 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 // SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/security.h> #include <linux/fscrypt.h> #include <linux/fileattr.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/namei.h> #include "internal.h" /** * fileattr_fill_xflags - initialize fileattr with xflags * @fa: fileattr pointer * @xflags: FS_XFLAG_* flags * * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags). All * other fields are zeroed. */ void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags) { memset(fa, 0, sizeof(*fa)); fa->fsx_valid = true; fa->fsx_xflags = xflags; if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE) fa->flags |= FS_IMMUTABLE_FL; if (fa->fsx_xflags & FS_XFLAG_APPEND) fa->flags |= FS_APPEND_FL; if (fa->fsx_xflags & FS_XFLAG_SYNC) fa->flags |= FS_SYNC_FL; if (fa->fsx_xflags & FS_XFLAG_NOATIME) fa->flags |= FS_NOATIME_FL; if (fa->fsx_xflags & FS_XFLAG_NODUMP) fa->flags |= FS_NODUMP_FL; if (fa->fsx_xflags & FS_XFLAG_DAX) fa->flags |= FS_DAX_FL; if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) fa->flags |= FS_PROJINHERIT_FL; } EXPORT_SYMBOL(fileattr_fill_xflags); /** * fileattr_fill_flags - initialize fileattr with flags * @fa: fileattr pointer * @flags: FS_*_FL flags * * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags). * All other fields are zeroed. */ void fileattr_fill_flags(struct file_kattr *fa, u32 flags) { memset(fa, 0, sizeof(*fa)); fa->flags_valid = true; fa->flags = flags; if (fa->flags & FS_SYNC_FL) fa->fsx_xflags |= FS_XFLAG_SYNC; if (fa->flags & FS_IMMUTABLE_FL) fa->fsx_xflags |= FS_XFLAG_IMMUTABLE; if (fa->flags & FS_APPEND_FL) fa->fsx_xflags |= FS_XFLAG_APPEND; if (fa->flags & FS_NODUMP_FL) fa->fsx_xflags |= FS_XFLAG_NODUMP; if (fa->flags & FS_NOATIME_FL) fa->fsx_xflags |= FS_XFLAG_NOATIME; if (fa->flags & FS_DAX_FL) fa->fsx_xflags |= FS_XFLAG_DAX; if (fa->flags & FS_PROJINHERIT_FL) fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; } EXPORT_SYMBOL(fileattr_fill_flags); /** * vfs_fileattr_get - retrieve miscellaneous file attributes * @dentry: the object to retrieve from * @fa: fileattr pointer * * Call i_op->fileattr_get() callback, if exists. * * Return: 0 on success, or a negative error on failure. */ int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); int error; if (!inode->i_op->fileattr_get) return -EOPNOTSUPP; error = security_inode_file_getattr(dentry, fa); if (error) return error; return inode->i_op->fileattr_get(dentry, fa); } EXPORT_SYMBOL(vfs_fileattr_get); static void fileattr_to_file_attr(const struct file_kattr *fa, struct file_attr *fattr) { __u32 mask = FS_XFLAGS_MASK; memset(fattr, 0, sizeof(struct file_attr)); fattr->fa_xflags = fa->fsx_xflags & mask; fattr->fa_extsize = fa->fsx_extsize; fattr->fa_nextents = fa->fsx_nextents; fattr->fa_projid = fa->fsx_projid; fattr->fa_cowextsize = fa->fsx_cowextsize; } /** * copy_fsxattr_to_user - copy fsxattr to userspace. * @fa: fileattr pointer * @ufa: fsxattr user pointer * * Return: 0 on success, or -EFAULT on failure. */ int copy_fsxattr_to_user(const struct file_kattr *fa, struct fsxattr __user *ufa) { struct fsxattr xfa; __u32 mask = FS_XFLAGS_MASK; memset(&xfa, 0, sizeof(xfa)); xfa.fsx_xflags = fa->fsx_xflags & mask; xfa.fsx_extsize = fa->fsx_extsize; xfa.fsx_nextents = fa->fsx_nextents; xfa.fsx_projid = fa->fsx_projid; xfa.fsx_cowextsize = fa->fsx_cowextsize; if (copy_to_user(ufa, &xfa, sizeof(xfa))) return -EFAULT; return 0; } EXPORT_SYMBOL(copy_fsxattr_to_user); static int file_attr_to_fileattr(const struct file_attr *fattr, struct file_kattr *fa) { __u64 mask = FS_XFLAGS_MASK; if (fattr->fa_xflags & ~mask) return -EINVAL; fileattr_fill_xflags(fa, fattr->fa_xflags); fa->fsx_xflags &= ~FS_XFLAG_RDONLY_MASK; fa->fsx_extsize = fattr->fa_extsize; fa->fsx_projid = fattr->fa_projid; fa->fsx_cowextsize = fattr->fa_cowextsize; return 0; } static int copy_fsxattr_from_user(struct file_kattr *fa, struct fsxattr __user *ufa) { struct fsxattr xfa; __u32 mask = FS_XFLAGS_MASK; if (copy_from_user(&xfa, ufa, sizeof(xfa))) return -EFAULT; if (xfa.fsx_xflags & ~mask) return -EOPNOTSUPP; fileattr_fill_xflags(fa, xfa.fsx_xflags); fa->fsx_xflags &= ~FS_XFLAG_RDONLY_MASK; fa->fsx_extsize = xfa.fsx_extsize; fa->fsx_nextents = xfa.fsx_nextents; fa->fsx_projid = xfa.fsx_projid; fa->fsx_cowextsize = xfa.fsx_cowextsize; return 0; } /* * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject * any invalid configurations. * * Note: must be called with inode lock held. */ static int fileattr_set_prepare(struct inode *inode, const struct file_kattr *old_ma, struct file_kattr *fa) { int err; /* * The IMMUTABLE and APPEND_ONLY flags can only be changed by * the relevant capability. */ if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) && !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags); if (err) return err; /* * Project Quota ID state is only allowed to change from within the init * namespace. Enforce that restriction only if we are trying to change * the quota ID state. Everything else is allowed in user namespaces. */ if (current_user_ns() != &init_user_ns) { if (old_ma->fsx_projid != fa->fsx_projid) return -EINVAL; if ((old_ma->fsx_xflags ^ fa->fsx_xflags) & FS_XFLAG_PROJINHERIT) return -EINVAL; } else { /* * Caller is allowed to change the project ID. If it is being * changed, make sure that the new value is valid. */ if (old_ma->fsx_projid != fa->fsx_projid && !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid))) return -EINVAL; } /* Check extent size hints. */ if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode)) return -EINVAL; if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && !S_ISDIR(inode->i_mode)) return -EINVAL; if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) && !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return -EINVAL; /* * It is only valid to set the DAX flag on regular files and * directories on filesystems. */ if ((fa->fsx_xflags & FS_XFLAG_DAX) && !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; /* Extent size hints of zero turn off the flags. */ if (fa->fsx_extsize == 0) fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); if (fa->fsx_cowextsize == 0) fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; return 0; } /** * vfs_fileattr_set - change miscellaneous file attributes * @idmap: idmap of the mount * @dentry: the object to change * @fa: fileattr pointer * * After verifying permissions, call i_op->fileattr_set() callback, if * exists. * * Verifying attributes involves retrieving current attributes with * i_op->fileattr_get(), this also allows initializing attributes that have * not been set by the caller to current values. Inode lock is held * thoughout to prevent racing with another instance. * * Return: 0 on success, or a negative error on failure. */ int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct file_kattr old_ma = {}; int err; if (!inode->i_op->fileattr_set) return -EOPNOTSUPP; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; inode_lock(inode); err = vfs_fileattr_get(dentry, &old_ma); if (!err) { /* initialize missing bits from old_ma */ if (fa->flags_valid) { fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON; fa->fsx_extsize = old_ma.fsx_extsize; fa->fsx_nextents = old_ma.fsx_nextents; fa->fsx_projid = old_ma.fsx_projid; fa->fsx_cowextsize = old_ma.fsx_cowextsize; } else { fa->flags |= old_ma.flags & ~FS_COMMON_FL; } err = fileattr_set_prepare(inode, &old_ma, fa); if (err) goto out; err = security_inode_file_setattr(dentry, fa); if (err) goto out; err = inode->i_op->fileattr_set(idmap, dentry, fa); if (err) goto out; } out: inode_unlock(inode); return err; } EXPORT_SYMBOL(vfs_fileattr_set); int ioctl_getflags(struct file *file, unsigned int __user *argp) { struct file_kattr fa = { .flags_valid = true }; /* hint only */ int err; err = vfs_fileattr_get(file->f_path.dentry, &fa); if (err == -EOPNOTSUPP) err = -ENOIOCTLCMD; if (!err) err = put_user(fa.flags, argp); return err; } EXPORT_SYMBOL(ioctl_getflags); int ioctl_setflags(struct file *file, unsigned int __user *argp) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct dentry *dentry = file->f_path.dentry; struct file_kattr fa; unsigned int flags; int err; err = get_user(flags, argp); if (!err) { err = mnt_want_write_file(file); if (!err) { fileattr_fill_flags(&fa, flags); err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); if (err == -EOPNOTSUPP) err = -ENOIOCTLCMD; } } return err; } EXPORT_SYMBOL(ioctl_setflags); int ioctl_fsgetxattr(struct file *file, void __user *argp) { struct file_kattr fa = { .fsx_valid = true }; /* hint only */ int err; err = vfs_fileattr_get(file->f_path.dentry, &fa); if (err == -EOPNOTSUPP) err = -ENOIOCTLCMD; if (!err) err = copy_fsxattr_to_user(&fa, argp); return err; } EXPORT_SYMBOL(ioctl_fsgetxattr); int ioctl_fssetxattr(struct file *file, void __user *argp) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct dentry *dentry = file->f_path.dentry; struct file_kattr fa; int err; err = copy_fsxattr_from_user(&fa, argp); if (!err) { err = mnt_want_write_file(file); if (!err) { err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); if (err == -EOPNOTSUPP) err = -ENOIOCTLCMD; } } return err; } EXPORT_SYMBOL(ioctl_fssetxattr); SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename, struct file_attr __user *, ufattr, size_t, usize, unsigned int, at_flags) { struct path filepath __free(path_put) = {}; struct filename *name __free(putname) = NULL; unsigned int lookup_flags = 0; struct file_attr fattr; struct file_kattr fa; int error; BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0); BUILD_BUG_ON(sizeof(struct file_attr) != FILE_ATTR_SIZE_LATEST); if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; if (!(at_flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (usize > PAGE_SIZE) return -E2BIG; if (usize < FILE_ATTR_SIZE_VER0) return -EINVAL; name = getname_maybe_null(filename, at_flags); if (IS_ERR(name)) return PTR_ERR(name); if (!name && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; filepath = fd_file(f)->f_path; path_get(&filepath); } else { error = filename_lookup(dfd, name, lookup_flags, &filepath, NULL); if (error) return error; } error = vfs_fileattr_get(filepath.dentry, &fa); if (error) return error; fileattr_to_file_attr(&fa, &fattr); error = copy_struct_to_user(ufattr, usize, &fattr, sizeof(struct file_attr), NULL); return error; } SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename, struct file_attr __user *, ufattr, size_t, usize, unsigned int, at_flags) { struct path filepath __free(path_put) = {}; struct filename *name __free(putname) = NULL; unsigned int lookup_flags = 0; struct file_attr fattr; struct file_kattr fa; int error; BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0); BUILD_BUG_ON(sizeof(struct file_attr) != FILE_ATTR_SIZE_LATEST); if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; if (!(at_flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (usize > PAGE_SIZE) return -E2BIG; if (usize < FILE_ATTR_SIZE_VER0) return -EINVAL; error = copy_struct_from_user(&fattr, sizeof(struct file_attr), ufattr, usize); if (error) return error; error = file_attr_to_fileattr(&fattr, &fa); if (error) return error; name = getname_maybe_null(filename, at_flags); if (IS_ERR(name)) return PTR_ERR(name); if (!name && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; filepath = fd_file(f)->f_path; path_get(&filepath); } else { error = filename_lookup(dfd, name, lookup_flags, &filepath, NULL); if (error) return error; } error = mnt_want_write(filepath.mnt); if (!error) { error = vfs_fileattr_set(mnt_idmap(filepath.mnt), filepath.dentry, &fa); mnt_drop_write(filepath.mnt); } return error; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 /* SPDX-License-Identifier: GPL-2.0 */ /* * USB PHY defines * * These APIs may be used between USB controllers. USB device drivers * (for either host or peripheral roles) don't use these calls; they * continue to use just usb_device and usb_gadget. */ #ifndef __LINUX_USB_PHY_H #define __LINUX_USB_PHY_H #include <linux/extcon.h> #include <linux/notifier.h> #include <linux/usb.h> #include <uapi/linux/usb/charger.h> enum usb_phy_interface { USBPHY_INTERFACE_MODE_UNKNOWN, USBPHY_INTERFACE_MODE_UTMI, USBPHY_INTERFACE_MODE_UTMIW, USBPHY_INTERFACE_MODE_ULPI, USBPHY_INTERFACE_MODE_SERIAL, USBPHY_INTERFACE_MODE_HSIC, }; enum usb_phy_events { USB_EVENT_NONE, /* no events or cable disconnected */ USB_EVENT_VBUS, /* vbus valid event */ USB_EVENT_ID, /* id was grounded */ USB_EVENT_CHARGER, /* usb dedicated charger */ USB_EVENT_ENUMERATED, /* gadget driver enumerated */ }; /* associate a type with PHY */ enum usb_phy_type { USB_PHY_TYPE_UNDEFINED, USB_PHY_TYPE_USB2, USB_PHY_TYPE_USB3, }; /* OTG defines lots of enumeration states before device reset */ enum usb_otg_state { OTG_STATE_UNDEFINED = 0, /* single-role peripheral, and dual-role default-b */ OTG_STATE_B_IDLE, OTG_STATE_B_SRP_INIT, OTG_STATE_B_PERIPHERAL, /* extra dual-role default-b states */ OTG_STATE_B_WAIT_ACON, OTG_STATE_B_HOST, /* dual-role default-a */ OTG_STATE_A_IDLE, OTG_STATE_A_WAIT_VRISE, OTG_STATE_A_WAIT_BCON, OTG_STATE_A_HOST, OTG_STATE_A_SUSPEND, OTG_STATE_A_PERIPHERAL, OTG_STATE_A_WAIT_VFALL, OTG_STATE_A_VBUS_ERR, }; struct usb_phy; struct usb_otg; /* for phys connected thru an ULPI interface, the user must * provide access ops */ struct usb_phy_io_ops { int (*read)(struct usb_phy *x, u32 reg); int (*write)(struct usb_phy *x, u32 val, u32 reg); }; struct usb_charger_current { unsigned int sdp_min; unsigned int sdp_max; unsigned int dcp_min; unsigned int dcp_max; unsigned int cdp_min; unsigned int cdp_max; unsigned int aca_min; unsigned int aca_max; }; struct usb_phy { struct device *dev; const char *label; unsigned int flags; enum usb_phy_type type; enum usb_phy_events last_event; struct usb_otg *otg; struct device *io_dev; struct usb_phy_io_ops *io_ops; void __iomem *io_priv; /* to support extcon device */ struct extcon_dev *edev; struct extcon_dev *id_edev; struct notifier_block vbus_nb; struct notifier_block id_nb; struct notifier_block type_nb; /* Support USB charger */ enum usb_charger_type chg_type; enum usb_charger_state chg_state; struct usb_charger_current chg_cur; struct work_struct chg_work; /* for notification of usb_phy_events */ struct atomic_notifier_head notifier; /* to pass extra port status to the root hub */ u16 port_status; u16 port_change; /* to support controllers that have multiple phys */ struct list_head head; /* initialize/shutdown the phy */ int (*init)(struct usb_phy *x); void (*shutdown)(struct usb_phy *x); /* enable/disable VBUS */ int (*set_vbus)(struct usb_phy *x, int on); /* effective for B devices, ignored for A-peripheral */ int (*set_power)(struct usb_phy *x, unsigned mA); /* Set phy into suspend mode */ int (*set_suspend)(struct usb_phy *x, int suspend); /* * Set wakeup enable for PHY, in that case, the PHY can be * woken up from suspend status due to external events, * like vbus change, dp/dm change and id. */ int (*set_wakeup)(struct usb_phy *x, bool enabled); /* notify phy connect status change */ int (*notify_connect)(struct usb_phy *x, enum usb_device_speed speed); int (*notify_disconnect)(struct usb_phy *x, enum usb_device_speed speed); /* * Charger detection method can be implemented if you need to * manually detect the charger type. */ enum usb_charger_type (*charger_detect)(struct usb_phy *x); }; /* for board-specific init logic */ extern int usb_add_phy(struct usb_phy *, enum usb_phy_type type); extern int usb_add_phy_dev(struct usb_phy *); extern void usb_remove_phy(struct usb_phy *); /* helpers for direct access thru low-level io interface */ static inline int usb_phy_io_read(struct usb_phy *x, u32 reg) { if (x && x->io_ops && x->io_ops->read) return x->io_ops->read(x, reg); return -EINVAL; } static inline int usb_phy_io_write(struct usb_phy *x, u32 val, u32 reg) { if (x && x->io_ops && x->io_ops->write) return x->io_ops->write(x, val, reg); return -EINVAL; } static inline int usb_phy_init(struct usb_phy *x) { if (x && x->init) return x->init(x); return 0; } static inline void usb_phy_shutdown(struct usb_phy *x) { if (x && x->shutdown) x->shutdown(x); } static inline int usb_phy_vbus_on(struct usb_phy *x) { if (!x || !x->set_vbus) return 0; return x->set_vbus(x, true); } static inline int usb_phy_vbus_off(struct usb_phy *x) { if (!x || !x->set_vbus) return 0; return x->set_vbus(x, false); } /* for usb host and peripheral controller drivers */ #if IS_ENABLED(CONFIG_USB_PHY) extern struct usb_phy *usb_get_phy(enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index); extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb); extern void usb_put_phy(struct usb_phy *); extern void usb_phy_set_event(struct usb_phy *x, unsigned long event); extern void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA); extern void usb_phy_get_charger_current(struct usb_phy *usb_phy, unsigned int *min, unsigned int *max); extern void usb_phy_set_charger_state(struct usb_phy *usb_phy, enum usb_charger_state state); #else static inline struct usb_phy *usb_get_phy(enum usb_phy_type type) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb) { return ERR_PTR(-ENXIO); } static inline void usb_put_phy(struct usb_phy *x) { } static inline void usb_phy_set_event(struct usb_phy *x, unsigned long event) { } static inline void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA) { } static inline void usb_phy_get_charger_current(struct usb_phy *usb_phy, unsigned int *min, unsigned int *max) { } static inline void usb_phy_set_charger_state(struct usb_phy *usb_phy, enum usb_charger_state state) { } #endif static inline int usb_phy_set_power(struct usb_phy *x, unsigned mA) { if (!x) return 0; usb_phy_set_charger_current(x, mA); if (x->set_power) return x->set_power(x, mA); return 0; } /* Context: can sleep */ static inline int usb_phy_set_suspend(struct usb_phy *x, int suspend) { if (x && x->set_suspend != NULL) return x->set_suspend(x, suspend); else return 0; } static inline int usb_phy_set_wakeup(struct usb_phy *x, bool enabled) { if (x && x->set_wakeup) return x->set_wakeup(x, enabled); else return 0; } static inline int usb_phy_notify_connect(struct usb_phy *x, enum usb_device_speed speed) { if (x && x->notify_connect) return x->notify_connect(x, speed); else return 0; } static inline int usb_phy_notify_disconnect(struct usb_phy *x, enum usb_device_speed speed) { if (x && x->notify_disconnect) return x->notify_disconnect(x, speed); else return 0; } /* notifiers */ static inline int usb_register_notifier(struct usb_phy *x, struct notifier_block *nb) { return atomic_notifier_chain_register(&x->notifier, nb); } static inline void usb_unregister_notifier(struct usb_phy *x, struct notifier_block *nb) { atomic_notifier_chain_unregister(&x->notifier, nb); } static inline const char *usb_phy_type_string(enum usb_phy_type type) { switch (type) { case USB_PHY_TYPE_USB2: return "USB2 PHY"; case USB_PHY_TYPE_USB3: return "USB3 PHY"; default: return "UNKNOWN PHY TYPE"; } } #endif /* __LINUX_USB_PHY_H */
36 18 18 18 18 18 18 18 18 18 18 18 18 36 36 36 36 36 36 36 36 36 18 18 18 18 18 18 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/module.h> #include <linux/err.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/uaccess.h> #ifdef CONFIG_BLOCK #include <linux/bio.h> #endif #include <linux/ceph/ceph_features.h> #include <linux/ceph/libceph.h> #include <linux/ceph/osd_client.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> #include <linux/ceph/auth.h> #include <linux/ceph/pagelist.h> #include <linux/ceph/striper.h> #define OSD_OPREPLY_FRONT_LEN 512 static struct kmem_cache *ceph_osd_request_cache; static const struct ceph_connection_operations osd_con_ops; /* * Implement client access to distributed object storage cluster. * * All data objects are stored within a cluster/cloud of OSDs, or * "object storage devices." (Note that Ceph OSDs have _nothing_ to * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply * remote daemons serving up and coordinating consistent and safe * access to storage. * * Cluster membership and the mapping of data objects onto storage devices * are described by the osd map. * * We keep track of pending OSD requests (read, write), resubmit * requests to different OSDs when the cluster topology/data layout * change, or retry the affected requests when the communications * channel with an OSD is reset. */ static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req); static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req); static void link_linger(struct ceph_osd *osd, struct ceph_osd_linger_request *lreq); static void unlink_linger(struct ceph_osd *osd, struct ceph_osd_linger_request *lreq); static void clear_backoffs(struct ceph_osd *osd); #if 1 static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem) { bool wrlocked = true; if (unlikely(down_read_trylock(sem))) { wrlocked = false; up_read(sem); } return wrlocked; } static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { WARN_ON(!rwsem_is_locked(&osdc->lock)); } static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { WARN_ON(!rwsem_is_wrlocked(&osdc->lock)); } static inline void verify_osd_locked(struct ceph_osd *osd) { struct ceph_osd_client *osdc = osd->o_osdc; WARN_ON(!(mutex_is_locked(&osd->lock) && rwsem_is_locked(&osdc->lock)) && !rwsem_is_wrlocked(&osdc->lock)); } static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { WARN_ON(!mutex_is_locked(&lreq->lock)); } #else static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { } static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { } static inline void verify_osd_locked(struct ceph_osd *osd) { } static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { } #endif /* * calculate the mapping of a file extent onto an object, and fill out the * request accordingly. shorten extent as necessary if it crosses an * object boundary. * * fill osd op in request message. */ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, u64 *objnum, u64 *objoff, u64 *objlen) { u64 orig_len = *plen; u32 xlen; /* object extent? */ ceph_calc_file_object_mapping(layout, off, orig_len, objnum, objoff, &xlen); *objlen = xlen; if (*objlen < orig_len) { *plen = *objlen; dout(" skipping last %llu, final file extent %llu~%llu\n", orig_len - *plen, off, *plen); } dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen); return 0; } static void ceph_osd_data_init(struct ceph_osd_data *osd_data) { memset(osd_data, 0, sizeof (*osd_data)); osd_data->type = CEPH_OSD_DATA_TYPE_NONE; } /* * Consumes @pages if @own_pages is true. */ static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) { osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; osd_data->pages = pages; osd_data->length = length; osd_data->alignment = alignment; osd_data->pages_from_pool = pages_from_pool; osd_data->own_pages = own_pages; } /* * Consumes a ref on @pagelist. */ static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, struct ceph_pagelist *pagelist) { osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST; osd_data->pagelist = pagelist; } #ifdef CONFIG_BLOCK static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, struct ceph_bio_iter *bio_pos, u32 bio_length) { osd_data->type = CEPH_OSD_DATA_TYPE_BIO; osd_data->bio_pos = *bio_pos; osd_data->bio_length = bio_length; } #endif /* CONFIG_BLOCK */ static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data, struct ceph_bvec_iter *bvec_pos, u32 num_bvecs) { osd_data->type = CEPH_OSD_DATA_TYPE_BVECS; osd_data->bvec_pos = *bvec_pos; osd_data->num_bvecs = num_bvecs; } static void ceph_osd_iter_init(struct ceph_osd_data *osd_data, struct iov_iter *iter) { osd_data->type = CEPH_OSD_DATA_TYPE_ITER; osd_data->iter = *iter; } static struct ceph_osd_data * osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which) { BUG_ON(which >= osd_req->r_num_ops); return &osd_req->r_ops[which].raw_data_in; } struct ceph_osd_data * osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, unsigned int which) { return osd_req_op_data(osd_req, which, extent, osd_data); } EXPORT_SYMBOL(osd_req_op_extent_osd_data); void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) { struct ceph_osd_data *osd_data; osd_data = osd_req_op_raw_data_in(osd_req, which); ceph_osd_data_pages_init(osd_data, pages, length, alignment, pages_from_pool, own_pages); } EXPORT_SYMBOL(osd_req_op_raw_data_in_pages); void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) { struct ceph_osd_data *osd_data; osd_data = osd_req_op_data(osd_req, which, extent, osd_data); ceph_osd_data_pages_init(osd_data, pages, length, alignment, pages_from_pool, own_pages); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages); #ifdef CONFIG_BLOCK void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, unsigned int which, struct ceph_bio_iter *bio_pos, u32 bio_length) { struct ceph_osd_data *osd_data; osd_data = osd_req_op_data(osd_req, which, extent, osd_data); ceph_osd_data_bio_init(osd_data, bio_pos, bio_length); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); #endif /* CONFIG_BLOCK */ void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req, unsigned int which, struct bio_vec *bvecs, u32 num_bvecs, u32 bytes) { struct ceph_osd_data *osd_data; struct ceph_bvec_iter it = { .bvecs = bvecs, .iter = { .bi_size = bytes }, }; osd_data = osd_req_op_data(osd_req, which, extent, osd_data); ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvecs); void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, unsigned int which, struct ceph_bvec_iter *bvec_pos) { struct ceph_osd_data *osd_data; osd_data = osd_req_op_data(osd_req, which, extent, osd_data); ceph_osd_data_bvecs_init(osd_data, bvec_pos, 0); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos); /** * osd_req_op_extent_osd_iter - Set up an operation with an iterator buffer * @osd_req: The request to set up * @which: Index of the operation in which to set the iter * @iter: The buffer iterator */ void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req, unsigned int which, struct iov_iter *iter) { struct ceph_osd_data *osd_data; osd_data = osd_req_op_data(osd_req, which, extent, osd_data); ceph_osd_iter_init(osd_data, iter); } EXPORT_SYMBOL(osd_req_op_extent_osd_iter); static void osd_req_op_cls_request_info_pagelist( struct ceph_osd_request *osd_req, unsigned int which, struct ceph_pagelist *pagelist) { struct ceph_osd_data *osd_data; osd_data = osd_req_op_data(osd_req, which, cls, request_info); ceph_osd_data_pagelist_init(osd_data, pagelist); } void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) { struct ceph_osd_data *osd_data; osd_data = osd_req_op_data(osd_req, which, cls, request_data); ceph_osd_data_pages_init(osd_data, pages, length, alignment, pages_from_pool, own_pages); osd_req->r_ops[which].cls.indata_len += length; osd_req->r_ops[which].indata_len += length; } EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, unsigned int which, struct bio_vec *bvecs, u32 num_bvecs, u32 bytes) { struct ceph_osd_data *osd_data; struct ceph_bvec_iter it = { .bvecs = bvecs, .iter = { .bi_size = bytes }, }; osd_data = osd_req_op_data(osd_req, which, cls, request_data); ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs); osd_req->r_ops[which].cls.indata_len += bytes; osd_req->r_ops[which].indata_len += bytes; } EXPORT_SYMBOL(osd_req_op_cls_request_data_bvecs); void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) { struct ceph_osd_data *osd_data; osd_data = osd_req_op_data(osd_req, which, cls, response_data); ceph_osd_data_pages_init(osd_data, pages, length, alignment, pages_from_pool, own_pages); } EXPORT_SYMBOL(osd_req_op_cls_response_data_pages); static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) { switch (osd_data->type) { case CEPH_OSD_DATA_TYPE_NONE: return 0; case CEPH_OSD_DATA_TYPE_PAGES: return osd_data->length; case CEPH_OSD_DATA_TYPE_PAGELIST: return (u64)osd_data->pagelist->length; #ifdef CONFIG_BLOCK case CEPH_OSD_DATA_TYPE_BIO: return (u64)osd_data->bio_length; #endif /* CONFIG_BLOCK */ case CEPH_OSD_DATA_TYPE_BVECS: return osd_data->bvec_pos.iter.bi_size; case CEPH_OSD_DATA_TYPE_ITER: return iov_iter_count(&osd_data->iter); default: WARN(true, "unrecognized data type %d\n", (int)osd_data->type); return 0; } } static void ceph_osd_data_release(struct ceph_osd_data *osd_data) { if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) { int num_pages; num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); ceph_release_page_vector(osd_data->pages, num_pages); } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { ceph_pagelist_release(osd_data->pagelist); } ceph_osd_data_init(osd_data); } static void osd_req_op_data_release(struct ceph_osd_request *osd_req, unsigned int which) { struct ceph_osd_req_op *op; BUG_ON(which >= osd_req->r_num_ops); op = &osd_req->r_ops[which]; switch (op->op) { case CEPH_OSD_OP_READ: case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: kfree(op->extent.sparse_ext); ceph_osd_data_release(&op->extent.osd_data); break; case CEPH_OSD_OP_CALL: ceph_osd_data_release(&op->cls.request_info); ceph_osd_data_release(&op->cls.request_data); ceph_osd_data_release(&op->cls.response_data); break; case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: ceph_osd_data_release(&op->xattr.osd_data); break; case CEPH_OSD_OP_STAT: ceph_osd_data_release(&op->raw_data_in); break; case CEPH_OSD_OP_NOTIFY_ACK: ceph_osd_data_release(&op->notify_ack.request_data); break; case CEPH_OSD_OP_NOTIFY: ceph_osd_data_release(&op->notify.request_data); ceph_osd_data_release(&op->notify.response_data); break; case CEPH_OSD_OP_LIST_WATCHERS: ceph_osd_data_release(&op->list_watchers.response_data); break; case CEPH_OSD_OP_COPY_FROM2: ceph_osd_data_release(&op->copy_from.osd_data); break; default: break; } } /* * Assumes @t is zero-initialized. */ static void target_init(struct ceph_osd_request_target *t) { ceph_oid_init(&t->base_oid); ceph_oloc_init(&t->base_oloc); ceph_oid_init(&t->target_oid); ceph_oloc_init(&t->target_oloc); ceph_osds_init(&t->acting); ceph_osds_init(&t->up); t->size = -1; t->min_size = -1; t->osd = CEPH_HOMELESS_OSD; } static void target_copy(struct ceph_osd_request_target *dest, const struct ceph_osd_request_target *src) { ceph_oid_copy(&dest->base_oid, &src->base_oid); ceph_oloc_copy(&dest->base_oloc, &src->base_oloc); ceph_oid_copy(&dest->target_oid, &src->target_oid); ceph_oloc_copy(&dest->target_oloc, &src->target_oloc); dest->pgid = src->pgid; /* struct */ dest->spgid = src->spgid; /* struct */ dest->pg_num = src->pg_num; dest->pg_num_mask = src->pg_num_mask; ceph_osds_copy(&dest->acting, &src->acting); ceph_osds_copy(&dest->up, &src->up); dest->size = src->size; dest->min_size = src->min_size; dest->sort_bitwise = src->sort_bitwise; dest->recovery_deletes = src->recovery_deletes; dest->flags = src->flags; dest->used_replica = src->used_replica; dest->paused = src->paused; dest->epoch = src->epoch; dest->last_force_resend = src->last_force_resend; dest->osd = src->osd; } static void target_destroy(struct ceph_osd_request_target *t) { ceph_oid_destroy(&t->base_oid); ceph_oloc_destroy(&t->base_oloc); ceph_oid_destroy(&t->target_oid); ceph_oloc_destroy(&t->target_oloc); } /* * requests */ static void request_release_checks(struct ceph_osd_request *req) { WARN_ON(!RB_EMPTY_NODE(&req->r_node)); WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node)); WARN_ON(!list_empty(&req->r_private_item)); WARN_ON(req->r_osd); } static void ceph_osdc_release_request(struct kref *kref) { struct ceph_osd_request *req = container_of(kref, struct ceph_osd_request, r_kref); unsigned int which; dout("%s %p (r_request %p r_reply %p)\n", __func__, req, req->r_request, req->r_reply); request_release_checks(req); if (req->r_request) ceph_msg_put(req->r_request); if (req->r_reply) ceph_msg_put(req->r_reply); for (which = 0; which < req->r_num_ops; which++) osd_req_op_data_release(req, which); target_destroy(&req->r_t); ceph_put_snap_context(req->r_snapc); if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) kmem_cache_free(ceph_osd_request_cache, req); else kfree(req); } void ceph_osdc_get_request(struct ceph_osd_request *req) { dout("%s %p (was %d)\n", __func__, req, kref_read(&req->r_kref)); kref_get(&req->r_kref); } EXPORT_SYMBOL(ceph_osdc_get_request); void ceph_osdc_put_request(struct ceph_osd_request *req) { if (req) { dout("%s %p (was %d)\n", __func__, req, kref_read(&req->r_kref)); kref_put(&req->r_kref, ceph_osdc_release_request); } } EXPORT_SYMBOL(ceph_osdc_put_request); static void request_init(struct ceph_osd_request *req) { /* req only, each op is zeroed in osd_req_op_init() */ memset(req, 0, sizeof(*req)); kref_init(&req->r_kref); init_completion(&req->r_completion); RB_CLEAR_NODE(&req->r_node); RB_CLEAR_NODE(&req->r_mc_node); INIT_LIST_HEAD(&req->r_private_item); target_init(&req->r_t); } struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, unsigned int num_ops, bool use_mempool, gfp_t gfp_flags) { struct ceph_osd_request *req; if (use_mempool) { BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); req = mempool_alloc(osdc->req_mempool, gfp_flags); } else if (num_ops <= CEPH_OSD_SLAB_OPS) { req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags); } else { BUG_ON(num_ops > CEPH_OSD_MAX_OPS); req = kmalloc(struct_size(req, r_ops, num_ops), gfp_flags); } if (unlikely(!req)) return NULL; request_init(req); req->r_osdc = osdc; req->r_mempool = use_mempool; req->r_num_ops = num_ops; req->r_snapid = CEPH_NOSNAP; req->r_snapc = ceph_get_snap_context(snapc); dout("%s req %p\n", __func__, req); return req; } EXPORT_SYMBOL(ceph_osdc_alloc_request); static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc) { return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0); } static int __ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp, int num_request_data_items, int num_reply_data_items) { struct ceph_osd_client *osdc = req->r_osdc; struct ceph_msg *msg; int msg_size; WARN_ON(req->r_request || req->r_reply); WARN_ON(ceph_oid_empty(&req->r_base_oid)); WARN_ON(ceph_oloc_empty(&req->r_base_oloc)); /* create request message */ msg_size = CEPH_ENCODING_START_BLK_LEN + CEPH_PGID_ENCODING_LEN + 1; /* spgid */ msg_size += 4 + 4 + 4; /* hash, osdmap_epoch, flags */ msg_size += CEPH_ENCODING_START_BLK_LEN + sizeof(struct ceph_osd_reqid); /* reqid */ msg_size += sizeof(struct ceph_blkin_trace_info); /* trace */ msg_size += 4 + sizeof(struct ceph_timespec); /* client_inc, mtime */ msg_size += CEPH_ENCODING_START_BLK_LEN + ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */ msg_size += 4 + req->r_base_oid.name_len; /* oid */ msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op); msg_size += 8; /* snapid */ msg_size += 8; /* snap_seq */ msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0); msg_size += 4 + 8; /* retry_attempt, features */ if (req->r_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, msg_size, num_request_data_items); else msg = ceph_msg_new2(CEPH_MSG_OSD_OP, msg_size, num_request_data_items, gfp, true); if (!msg) return -ENOMEM; memset(msg->front.iov_base, 0, msg->front.iov_len); req->r_request = msg; /* create reply message */ msg_size = OSD_OPREPLY_FRONT_LEN; msg_size += req->r_base_oid.name_len; msg_size += req->r_num_ops * sizeof(struct ceph_osd_op); if (req->r_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op_reply, msg_size, num_reply_data_items); else msg = ceph_msg_new2(CEPH_MSG_OSD_OPREPLY, msg_size, num_reply_data_items, gfp, true); if (!msg) return -ENOMEM; req->r_reply = msg; return 0; } static bool osd_req_opcode_valid(u16 opcode) { switch (opcode) { #define GENERATE_CASE(op, opcode, str) case CEPH_OSD_OP_##op: return true; __CEPH_FORALL_OSD_OPS(GENERATE_CASE) #undef GENERATE_CASE default: return false; } } static void get_num_data_items(struct ceph_osd_request *req, int *num_request_data_items, int *num_reply_data_items) { struct ceph_osd_req_op *op; *num_request_data_items = 0; *num_reply_data_items = 0; for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) { switch (op->op) { /* request */ case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_COPY_FROM2: *num_request_data_items += 1; break; /* reply */ case CEPH_OSD_OP_STAT: case CEPH_OSD_OP_READ: case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_LIST_WATCHERS: *num_reply_data_items += 1; break; /* both */ case CEPH_OSD_OP_NOTIFY: *num_request_data_items += 1; *num_reply_data_items += 1; break; case CEPH_OSD_OP_CALL: *num_request_data_items += 2; *num_reply_data_items += 1; break; default: WARN_ON(!osd_req_opcode_valid(op->op)); break; } } } /* * oid, oloc and OSD op opcode(s) must be filled in before this function * is called. */ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) { int num_request_data_items, num_reply_data_items; get_num_data_items(req, &num_request_data_items, &num_reply_data_items); return __ceph_osdc_alloc_messages(req, gfp, num_request_data_items, num_reply_data_items); } EXPORT_SYMBOL(ceph_osdc_alloc_messages); /* * This is an osd op init function for opcodes that have no data or * other information associated with them. It also serves as a * common init routine for all the other init functions, below. */ struct ceph_osd_req_op * osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u32 flags) { struct ceph_osd_req_op *op; BUG_ON(which >= osd_req->r_num_ops); BUG_ON(!osd_req_opcode_valid(opcode)); op = &osd_req->r_ops[which]; memset(op, 0, sizeof (*op)); op->op = opcode; op->flags = flags; return op; } EXPORT_SYMBOL(osd_req_op_init); void osd_req_op_extent_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq) { struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, opcode, 0); size_t payload_len = 0; BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE && opcode != CEPH_OSD_OP_SPARSE_READ); op->extent.offset = offset; op->extent.length = length; op->extent.truncate_size = truncate_size; op->extent.truncate_seq = truncate_seq; if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL) payload_len += length; op->indata_len = payload_len; } EXPORT_SYMBOL(osd_req_op_extent_init); void osd_req_op_extent_update(struct ceph_osd_request *osd_req, unsigned int which, u64 length) { struct ceph_osd_req_op *op; u64 previous; BUG_ON(which >= osd_req->r_num_ops); op = &osd_req->r_ops[which]; previous = op->extent.length; if (length == previous) return; /* Nothing to do */ BUG_ON(length > previous); op->extent.length = length; if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL) op->indata_len -= previous - length; } EXPORT_SYMBOL(osd_req_op_extent_update); void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, unsigned int which, u64 offset_inc) { struct ceph_osd_req_op *op, *prev_op; BUG_ON(which + 1 >= osd_req->r_num_ops); prev_op = &osd_req->r_ops[which]; op = osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags); /* dup previous one */ op->indata_len = prev_op->indata_len; op->outdata_len = prev_op->outdata_len; op->extent = prev_op->extent; /* adjust offset */ op->extent.offset += offset_inc; op->extent.length -= offset_inc; if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL) op->indata_len -= offset_inc; } EXPORT_SYMBOL(osd_req_op_extent_dup_last); int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, const char *class, const char *method) { struct ceph_osd_req_op *op; struct ceph_pagelist *pagelist; size_t payload_len = 0; size_t size; int ret; op = osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!pagelist) return -ENOMEM; op->cls.class_name = class; size = strlen(class); BUG_ON(size > (size_t) U8_MAX); op->cls.class_len = size; ret = ceph_pagelist_append(pagelist, class, size); if (ret) goto err_pagelist_free; payload_len += size; op->cls.method_name = method; size = strlen(method); BUG_ON(size > (size_t) U8_MAX); op->cls.method_len = size; ret = ceph_pagelist_append(pagelist, method, size); if (ret) goto err_pagelist_free; payload_len += size; osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); op->indata_len = payload_len; return 0; err_pagelist_free: ceph_pagelist_release(pagelist); return ret; } EXPORT_SYMBOL(osd_req_op_cls_init); int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *name, const void *value, size_t size, u8 cmp_op, u8 cmp_mode) { struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, opcode, 0); struct ceph_pagelist *pagelist; size_t payload_len; int ret; BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!pagelist) return -ENOMEM; payload_len = strlen(name); op->xattr.name_len = payload_len; ret = ceph_pagelist_append(pagelist, name, payload_len); if (ret) goto err_pagelist_free; op->xattr.value_len = size; ret = ceph_pagelist_append(pagelist, value, size); if (ret) goto err_pagelist_free; payload_len += size; op->xattr.cmp_op = cmp_op; op->xattr.cmp_mode = cmp_mode; ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); op->indata_len = payload_len; return 0; err_pagelist_free: ceph_pagelist_release(pagelist); return ret; } EXPORT_SYMBOL(osd_req_op_xattr_init); /* * @watch_opcode: CEPH_OSD_WATCH_OP_* */ static void osd_req_op_watch_init(struct ceph_osd_request *req, int which, u8 watch_opcode, u64 cookie, u32 gen) { struct ceph_osd_req_op *op; op = osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0); op->watch.cookie = cookie; op->watch.op = watch_opcode; op->watch.gen = gen; } /* * prot_ver, timeout and notify payload (may be empty) should already be * encoded in @request_pl */ static void osd_req_op_notify_init(struct ceph_osd_request *req, int which, u64 cookie, struct ceph_pagelist *request_pl) { struct ceph_osd_req_op *op; op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); op->notify.cookie = cookie; ceph_osd_data_pagelist_init(&op->notify.request_data, request_pl); op->indata_len = request_pl->length; } /* * @flags: CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, unsigned int which, u64 expected_object_size, u64 expected_write_size, u32 flags) { struct ceph_osd_req_op *op; op = osd_req_op_init(osd_req, which, CEPH_OSD_OP_SETALLOCHINT, 0); op->alloc_hint.expected_object_size = expected_object_size; op->alloc_hint.expected_write_size = expected_write_size; op->alloc_hint.flags = flags; /* * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed * not worth a feature bit. Set FAILOK per-op flag to make * sure older osds don't trip over an unsupported opcode. */ op->flags |= CEPH_OSD_OP_FLAG_FAILOK; } EXPORT_SYMBOL(osd_req_op_alloc_hint_init); static void ceph_osdc_msg_data_add(struct ceph_msg *msg, struct ceph_osd_data *osd_data) { u64 length = ceph_osd_data_length(osd_data); if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { BUG_ON(length > (u64) SIZE_MAX); if (length) ceph_msg_data_add_pages(msg, osd_data->pages, length, osd_data->alignment, false); } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { BUG_ON(!length); ceph_msg_data_add_pagelist(msg, osd_data->pagelist); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { ceph_msg_data_add_bio(msg, &osd_data->bio_pos, length); #endif } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) { ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos); } else if (osd_data->type == CEPH_OSD_DATA_TYPE_ITER) { ceph_msg_data_add_iter(msg, &osd_data->iter); } else { BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); } } static u32 osd_req_encode_op(struct ceph_osd_op *dst, const struct ceph_osd_req_op *src) { switch (src->op) { case CEPH_OSD_OP_STAT: break; case CEPH_OSD_OP_READ: case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: case CEPH_OSD_OP_ZERO: case CEPH_OSD_OP_TRUNCATE: dst->extent.offset = cpu_to_le64(src->extent.offset); dst->extent.length = cpu_to_le64(src->extent.length); dst->extent.truncate_size = cpu_to_le64(src->extent.truncate_size); dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); break; case CEPH_OSD_OP_CALL: dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); break; case CEPH_OSD_OP_WATCH: dst->watch.cookie = cpu_to_le64(src->watch.cookie); dst->watch.ver = cpu_to_le64(0); dst->watch.op = src->watch.op; dst->watch.gen = cpu_to_le32(src->watch.gen); break; case CEPH_OSD_OP_NOTIFY_ACK: break; case CEPH_OSD_OP_NOTIFY: dst->notify.cookie = cpu_to_le64(src->notify.cookie); break; case CEPH_OSD_OP_LIST_WATCHERS: break; case CEPH_OSD_OP_SETALLOCHINT: dst->alloc_hint.expected_object_size = cpu_to_le64(src->alloc_hint.expected_object_size); dst->alloc_hint.expected_write_size = cpu_to_le64(src->alloc_hint.expected_write_size); dst->alloc_hint.flags = cpu_to_le32(src->alloc_hint.flags); break; case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: dst->xattr.name_len = cpu_to_le32(src->xattr.name_len); dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); dst->xattr.cmp_op = src->xattr.cmp_op; dst->xattr.cmp_mode = src->xattr.cmp_mode; break; case CEPH_OSD_OP_CREATE: case CEPH_OSD_OP_DELETE: break; case CEPH_OSD_OP_COPY_FROM2: dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid); dst->copy_from.src_version = cpu_to_le64(src->copy_from.src_version); dst->copy_from.flags = src->copy_from.flags; dst->copy_from.src_fadvise_flags = cpu_to_le32(src->copy_from.src_fadvise_flags); break; case CEPH_OSD_OP_ASSERT_VER: dst->assert_ver.unused = cpu_to_le64(0); dst->assert_ver.ver = cpu_to_le64(src->assert_ver.ver); break; default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); WARN_ON(1); return 0; } dst->op = cpu_to_le16(src->op); dst->flags = cpu_to_le32(src->flags); dst->payload_len = cpu_to_le32(src->indata_len); return src->indata_len; } /* * build new request AND message, calculate layout, and adjust file * extent as needed. * * if the file was recently truncated, we include information about its * old and new size so that the object can be updated appropriately. (we * avoid synchronously deleting truncated objects because it's slow.) */ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, struct ceph_vino vino, u64 off, u64 *plen, unsigned int which, int num_ops, int opcode, int flags, struct ceph_snap_context *snapc, u32 truncate_seq, u64 truncate_size, bool use_mempool) { struct ceph_osd_request *req; u64 objnum = 0; u64 objoff = 0; u64 objlen = 0; int r; BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE && opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE && opcode != CEPH_OSD_OP_SPARSE_READ); req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, GFP_NOFS); if (!req) { r = -ENOMEM; goto fail; } /* calculate max write size */ r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); if (r) goto fail; if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { osd_req_op_init(req, which, opcode, 0); } else { u32 object_size = layout->object_size; u32 object_base = off - objoff; if (!(truncate_seq == 1 && truncate_size == -1ULL)) { if (truncate_size <= object_base) { truncate_size = 0; } else { truncate_size -= object_base; if (truncate_size > object_size) truncate_size = object_size; } } osd_req_op_extent_init(req, which, opcode, objoff, objlen, truncate_size, truncate_seq); } req->r_base_oloc.pool = layout->pool_id; req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); req->r_flags = flags | osdc->client->options->read_from_replica; req->r_snapid = vino.snap; if (flags & CEPH_OSD_FLAG_WRITE) req->r_data_offset = off; if (num_ops > 1) { int num_req_ops, num_rep_ops; /* * If this is a multi-op write request, assume that we'll need * request ops. If it's a multi-op read then assume we'll need * reply ops. Anything else and call it -EINVAL. */ if (flags & CEPH_OSD_FLAG_WRITE) { num_req_ops = num_ops; num_rep_ops = 0; } else if (flags & CEPH_OSD_FLAG_READ) { num_req_ops = 0; num_rep_ops = num_ops; } else { r = -EINVAL; goto fail; } r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_req_ops, num_rep_ops); } else { r = ceph_osdc_alloc_messages(req, GFP_NOFS); } if (r) goto fail; return req; fail: ceph_osdc_put_request(req); return ERR_PTR(r); } EXPORT_SYMBOL(ceph_osdc_new_request); int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt) { WARN_ON(op->op != CEPH_OSD_OP_SPARSE_READ); op->extent.sparse_ext_cnt = cnt; op->extent.sparse_ext = kmalloc_array(cnt, sizeof(*op->extent.sparse_ext), GFP_NOFS); if (!op->extent.sparse_ext) return -ENOMEM; return 0; } EXPORT_SYMBOL(__ceph_alloc_sparse_ext_map); /* * We keep osd requests in an rbtree, sorted by ->r_tid. */ DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node) DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node) /* * Call @fn on each OSD request as long as @fn returns 0. */ static void for_each_request(struct ceph_osd_client *osdc, int (*fn)(struct ceph_osd_request *req, void *arg), void *arg) { struct rb_node *n, *p; for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); for (p = rb_first(&osd->o_requests); p; ) { struct ceph_osd_request *req = rb_entry(p, struct ceph_osd_request, r_node); p = rb_next(p); if (fn(req, arg)) return; } } for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) { struct ceph_osd_request *req = rb_entry(p, struct ceph_osd_request, r_node); p = rb_next(p); if (fn(req, arg)) return; } } static bool osd_homeless(struct ceph_osd *osd) { return osd->o_osd == CEPH_HOMELESS_OSD; } static bool osd_registered(struct ceph_osd *osd) { verify_osdc_locked(osd->o_osdc); return !RB_EMPTY_NODE(&osd->o_node); } /* * Assumes @osd is zero-initialized. */ static void osd_init(struct ceph_osd *osd) { refcount_set(&osd->o_ref, 1); RB_CLEAR_NODE(&osd->o_node); spin_lock_init(&osd->o_requests_lock); osd->o_requests = RB_ROOT; osd->o_linger_requests = RB_ROOT; osd->o_backoff_mappings = RB_ROOT; osd->o_backoffs_by_id = RB_ROOT; INIT_LIST_HEAD(&osd->o_osd_lru); INIT_LIST_HEAD(&osd->o_keepalive_item); osd->o_incarnation = 1; mutex_init(&osd->lock); } static void ceph_init_sparse_read(struct ceph_sparse_read *sr) { kfree(sr->sr_extent); memset(sr, '\0', sizeof(*sr)); sr->sr_state = CEPH_SPARSE_READ_HDR; } static void osd_cleanup(struct ceph_osd *osd) { WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoff_mappings)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoffs_by_id)); WARN_ON(!list_empty(&osd->o_osd_lru)); WARN_ON(!list_empty(&osd->o_keepalive_item)); ceph_init_sparse_read(&osd->o_sparse_read); if (osd->o_auth.authorizer) { WARN_ON(osd_homeless(osd)); ceph_auth_destroy_authorizer(osd->o_auth.authorizer); } } /* * Track open sessions with osds. */ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) { struct ceph_osd *osd; WARN_ON(onum == CEPH_HOMELESS_OSD); osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL); osd_init(osd); osd->o_osdc = osdc; osd->o_osd = onum; osd->o_sparse_op_idx = -1; ceph_init_sparse_read(&osd->o_sparse_read); ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); return osd; } static struct ceph_osd *get_osd(struct ceph_osd *osd) { if (refcount_inc_not_zero(&osd->o_ref)) { dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1, refcount_read(&osd->o_ref)); return osd; } else { dout("get_osd %p FAIL\n", osd); return NULL; } } static void put_osd(struct ceph_osd *osd) { dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref), refcount_read(&osd->o_ref) - 1); if (refcount_dec_and_test(&osd->o_ref)) { osd_cleanup(osd); kfree(osd); } } DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node) static void __move_osd_to_lru(struct ceph_osd *osd) { struct ceph_osd_client *osdc = osd->o_osdc; dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); BUG_ON(!list_empty(&osd->o_osd_lru)); spin_lock(&osdc->osd_lru_lock); list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); spin_unlock(&osdc->osd_lru_lock); osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl; } static void maybe_move_osd_to_lru(struct ceph_osd *osd) { if (RB_EMPTY_ROOT(&osd->o_requests) && RB_EMPTY_ROOT(&osd->o_linger_requests)) __move_osd_to_lru(osd); } static void __remove_osd_from_lru(struct ceph_osd *osd) { struct ceph_osd_client *osdc = osd->o_osdc; dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); spin_lock(&osdc->osd_lru_lock); if (!list_empty(&osd->o_osd_lru)) list_del_init(&osd->o_osd_lru); spin_unlock(&osdc->osd_lru_lock); } /* * Close the connection and assign any leftover requests to the * homeless session. */ static void close_osd(struct ceph_osd *osd) { struct ceph_osd_client *osdc = osd->o_osdc; struct rb_node *n; verify_osdc_wrlocked(osdc); dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); ceph_con_close(&osd->o_con); for (n = rb_first(&osd->o_requests); n; ) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); n = rb_next(n); /* unlink_request() */ dout(" reassigning req %p tid %llu\n", req, req->r_tid); unlink_request(osd, req); link_request(&osdc->homeless_osd, req); } for (n = rb_first(&osd->o_linger_requests); n; ) { struct ceph_osd_linger_request *lreq = rb_entry(n, struct ceph_osd_linger_request, node); n = rb_next(n); /* unlink_linger() */ dout(" reassigning lreq %p linger_id %llu\n", lreq, lreq->linger_id); unlink_linger(osd, lreq); link_linger(&osdc->homeless_osd, lreq); } clear_backoffs(osd); __remove_osd_from_lru(osd); erase_osd(&osdc->osds, osd); put_osd(osd); } /* * reset osd connect */ static int reopen_osd(struct ceph_osd *osd) { struct ceph_entity_addr *peer_addr; dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); if (RB_EMPTY_ROOT(&osd->o_requests) && RB_EMPTY_ROOT(&osd->o_linger_requests)) { close_osd(osd); return -ENODEV; } peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd]; if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && !ceph_con_opened(&osd->o_con)) { struct rb_node *n; dout("osd addr hasn't changed and connection never opened, " "letting msgr retry\n"); /* touch each r_stamp for handle_timeout()'s benfit */ for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); req->r_stamp = jiffies; } return -EAGAIN; } ceph_con_close(&osd->o_con); ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr); osd->o_incarnation++; return 0; } static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o, bool wrlocked) { struct ceph_osd *osd; if (wrlocked) verify_osdc_wrlocked(osdc); else verify_osdc_locked(osdc); if (o != CEPH_HOMELESS_OSD) osd = lookup_osd(&osdc->osds, o); else osd = &osdc->homeless_osd; if (!osd) { if (!wrlocked) return ERR_PTR(-EAGAIN); osd = create_osd(osdc, o); insert_osd(&osdc->osds, osd); ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, &osdc->osdmap->osd_addr[osd->o_osd]); } dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd); return osd; } /* * Create request <-> OSD session relation. * * @req has to be assigned a tid, @osd may be homeless. */ static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req) { verify_osd_locked(osd); WARN_ON(!req->r_tid || req->r_osd); dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd, req, req->r_tid); if (!osd_homeless(osd)) __remove_osd_from_lru(osd); else atomic_inc(&osd->o_osdc->num_homeless); get_osd(osd); spin_lock(&osd->o_requests_lock); insert_request(&osd->o_requests, req); spin_unlock(&osd->o_requests_lock); req->r_osd = osd; } static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req) { verify_osd_locked(osd); WARN_ON(req->r_osd != osd); dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd, req, req->r_tid); req->r_osd = NULL; spin_lock(&osd->o_requests_lock); erase_request(&osd->o_requests, req); spin_unlock(&osd->o_requests_lock); put_osd(osd); if (!osd_homeless(osd)) maybe_move_osd_to_lru(osd); else atomic_dec(&osd->o_osdc->num_homeless); } static bool __pool_full(struct ceph_pg_pool_info *pi) { return pi->flags & CEPH_POOL_FLAG_FULL; } static bool have_pool_full(struct ceph_osd_client *osdc) { struct rb_node *n; for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pi = rb_entry(n, struct ceph_pg_pool_info, node); if (__pool_full(pi)) return true; } return false; } static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id) { struct ceph_pg_pool_info *pi; pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id); if (!pi) return false; return __pool_full(pi); } /* * Returns whether a request should be blocked from being sent * based on the current osdmap and osd_client settings. */ static bool target_should_be_paused(struct ceph_osd_client *osdc, const struct ceph_osd_request_target *t, struct ceph_pg_pool_info *pi) { bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || __pool_full(pi); WARN_ON(pi->id != t->target_oloc.pool); return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) || ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) || (osdc->osdmap->epoch < osdc->epoch_barrier); } static int pick_random_replica(const struct ceph_osds *acting) { int i = get_random_u32_below(acting->size); dout("%s picked osd%d, primary osd%d\n", __func__, acting->osds[i], acting->primary); return i; } /* * Picks the closest replica based on client's location given by * crush_location option. Prefers the primary if the locality is * the same. */ static int pick_closest_replica(struct ceph_osd_client *osdc, const struct ceph_osds *acting) { struct ceph_options *opt = osdc->client->options; int best_i, best_locality; int i = 0, locality; do { locality = ceph_get_crush_locality(osdc->osdmap, acting->osds[i], &opt->crush_locs); if (i == 0 || (locality >= 0 && best_locality < 0) || (locality >= 0 && best_locality >= 0 && locality < best_locality)) { best_i = i; best_locality = locality; } } while (++i < acting->size); dout("%s picked osd%d with locality %d, primary osd%d\n", __func__, acting->osds[best_i], best_locality, acting->primary); return best_i; } enum calc_target_result { CALC_TARGET_NO_ACTION = 0, CALC_TARGET_NEED_RESEND, CALC_TARGET_POOL_DNE, }; static enum calc_target_result calc_target(struct ceph_osd_client *osdc, struct ceph_osd_request_target *t, bool any_change) { struct ceph_pg_pool_info *pi; struct ceph_pg pgid, last_pgid; struct ceph_osds up, acting; bool is_read = t->flags & CEPH_OSD_FLAG_READ; bool is_write = t->flags & CEPH_OSD_FLAG_WRITE; bool force_resend = false; bool unpaused = false; bool legacy_change = false; bool split = false; bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE); bool recovery_deletes = ceph_osdmap_flag(osdc, CEPH_OSDMAP_RECOVERY_DELETES); enum calc_target_result ct_res; t->epoch = osdc->osdmap->epoch; pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool); if (!pi) { t->osd = CEPH_HOMELESS_OSD; ct_res = CALC_TARGET_POOL_DNE; goto out; } if (osdc->osdmap->epoch == pi->last_force_request_resend) { if (t->last_force_resend < pi->last_force_request_resend) { t->last_force_resend = pi->last_force_request_resend; force_resend = true; } else if (t->last_force_resend == 0) { force_resend = true; } } /* apply tiering */ ceph_oid_copy(&t->target_oid, &t->base_oid); ceph_oloc_copy(&t->target_oloc, &t->base_oloc); if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { if (is_read && pi->read_tier >= 0) t->target_oloc.pool = pi->read_tier; if (is_write && pi->write_tier >= 0) t->target_oloc.pool = pi->write_tier; pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool); if (!pi) { t->osd = CEPH_HOMELESS_OSD; ct_res = CALC_TARGET_POOL_DNE; goto out; } } __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, &pgid); last_pgid.pool = pgid.pool; last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); ceph_pg_to_up_acting_osds(osdc->osdmap, pi, &pgid, &up, &acting); if (any_change && ceph_is_new_interval(&t->acting, &acting, &t->up, &up, t->size, pi->size, t->min_size, pi->min_size, t->pg_num, pi->pg_num, t->sort_bitwise, sort_bitwise, t->recovery_deletes, recovery_deletes, &last_pgid)) force_resend = true; if (t->paused && !target_should_be_paused(osdc, t, pi)) { t->paused = false; unpaused = true; } legacy_change = ceph_pg_compare(&t->pgid, &pgid) || ceph_osds_changed(&t->acting, &acting, t->used_replica || any_change); if (t->pg_num) split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num); if (legacy_change || force_resend || split) { t->pgid = pgid; /* struct */ ceph_pg_to_primary_shard(osdc->osdmap, pi, &pgid, &t->spgid); ceph_osds_copy(&t->acting, &acting); ceph_osds_copy(&t->up, &up); t->size = pi->size; t->min_size = pi->min_size; t->pg_num = pi->pg_num; t->pg_num_mask = pi->pg_num_mask; t->sort_bitwise = sort_bitwise; t->recovery_deletes = recovery_deletes; if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS | CEPH_OSD_FLAG_LOCALIZE_READS)) && !is_write && pi->type == CEPH_POOL_TYPE_REP && acting.size > 1) { int pos; WARN_ON(!is_read || acting.osds[0] != acting.primary); if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) { pos = pick_random_replica(&acting); } else { pos = pick_closest_replica(osdc, &acting); } t->osd = acting.osds[pos]; t->used_replica = pos > 0; } else { t->osd = acting.primary; t->used_replica = false; } } if (unpaused || legacy_change || force_resend || split) ct_res = CALC_TARGET_NEED_RESEND; else ct_res = CALC_TARGET_NO_ACTION; out: dout("%s t %p -> %d%d%d%d ct_res %d osd%d\n", __func__, t, unpaused, legacy_change, force_resend, split, ct_res, t->osd); return ct_res; } static struct ceph_spg_mapping *alloc_spg_mapping(void) { struct ceph_spg_mapping *spg; spg = kmalloc(sizeof(*spg), GFP_NOIO); if (!spg) return NULL; RB_CLEAR_NODE(&spg->node); spg->backoffs = RB_ROOT; return spg; } static void free_spg_mapping(struct ceph_spg_mapping *spg) { WARN_ON(!RB_EMPTY_NODE(&spg->node)); WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs)); kfree(spg); } /* * rbtree of ceph_spg_mapping for handling map<spg_t, ...>, similar to * ceph_pg_mapping. Used to track OSD backoffs -- a backoff [range] is * defined only within a specific spgid; it does not pass anything to * children on split, or to another primary. */ DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid, ceph_spg_compare, RB_BYPTR, const struct ceph_spg *, node) static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid) { return hoid->is_max ? 0x100000000ull : hoid->hash_reverse_bits; } static void hoid_get_effective_key(const struct ceph_hobject_id *hoid, void **pkey, size_t *pkey_len) { if (hoid->key_len) { *pkey = hoid->key; *pkey_len = hoid->key_len; } else { *pkey = hoid->oid; *pkey_len = hoid->oid_len; } } static int compare_names(const void *name1, size_t name1_len, const void *name2, size_t name2_len) { int ret; ret = memcmp(name1, name2, min(name1_len, name2_len)); if (!ret) { if (name1_len < name2_len) ret = -1; else if (name1_len > name2_len) ret = 1; } return ret; } static int hoid_compare(const struct ceph_hobject_id *lhs, const struct ceph_hobject_id *rhs) { void *effective_key1, *effective_key2; size_t effective_key1_len, effective_key2_len; int ret; if (lhs->is_max < rhs->is_max) return -1; if (lhs->is_max > rhs->is_max) return 1; if (lhs->pool < rhs->pool) return -1; if (lhs->pool > rhs->pool) return 1; if (hoid_get_bitwise_key(lhs) < hoid_get_bitwise_key(rhs)) return -1; if (hoid_get_bitwise_key(lhs) > hoid_get_bitwise_key(rhs)) return 1; ret = compare_names(lhs->nspace, lhs->nspace_len, rhs->nspace, rhs->nspace_len); if (ret) return ret; hoid_get_effective_key(lhs, &effective_key1, &effective_key1_len); hoid_get_effective_key(rhs, &effective_key2, &effective_key2_len); ret = compare_names(effective_key1, effective_key1_len, effective_key2, effective_key2_len); if (ret) return ret; ret = compare_names(lhs->oid, lhs->oid_len, rhs->oid, rhs->oid_len); if (ret) return ret; if (lhs->snapid < rhs->snapid) return -1; if (lhs->snapid > rhs->snapid) return 1; return 0; } /* * For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX * compat stuff here. * * Assumes @hoid is zero-initialized. */ static int decode_hoid(void **p, void *end, struct ceph_hobject_id *hoid) { u8 struct_v; u32 struct_len; int ret; ret = ceph_start_decoding(p, end, 4, "hobject_t", &struct_v, &struct_len); if (ret) return ret; if (struct_v < 4) { pr_err("got struct_v %d < 4 of hobject_t\n", struct_v); goto e_inval; } hoid->key = ceph_extract_encoded_string(p, end, &hoid->key_len, GFP_NOIO); if (IS_ERR(hoid->key)) { ret = PTR_ERR(hoid->key); hoid->key = NULL; return ret; } hoid->oid = ceph_extract_encoded_string(p, end, &hoid->oid_len, GFP_NOIO); if (IS_ERR(hoid->oid)) { ret = PTR_ERR(hoid->oid); hoid->oid = NULL; return ret; } ceph_decode_64_safe(p, end, hoid->snapid, e_inval); ceph_decode_32_safe(p, end, hoid->hash, e_inval); ceph_decode_8_safe(p, end, hoid->is_max, e_inval); hoid->nspace = ceph_extract_encoded_string(p, end, &hoid->nspace_len, GFP_NOIO); if (IS_ERR(hoid->nspace)) { ret = PTR_ERR(hoid->nspace); hoid->nspace = NULL; return ret; } ceph_decode_64_safe(p, end, hoid->pool, e_inval); ceph_hoid_build_hash_cache(hoid); return 0; e_inval: return -EINVAL; } static int hoid_encoding_size(const struct ceph_hobject_id *hoid) { return 8 + 4 + 1 + 8 + /* snapid, hash, is_max, pool */ 4 + hoid->key_len + 4 + hoid->oid_len + 4 + hoid->nspace_len; } static void encode_hoid(void **p, void *end, const struct ceph_hobject_id *hoid) { ceph_start_encoding(p, 4, 3, hoid_encoding_size(hoid)); ceph_encode_string(p, end, hoid->key, hoid->key_len); ceph_encode_string(p, end, hoid->oid, hoid->oid_len); ceph_encode_64(p, hoid->snapid); ceph_encode_32(p, hoid->hash); ceph_encode_8(p, hoid->is_max); ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len); ceph_encode_64(p, hoid->pool); } static void free_hoid(struct ceph_hobject_id *hoid) { if (hoid) { kfree(hoid->key); kfree(hoid->oid); kfree(hoid->nspace); kfree(hoid); } } static struct ceph_osd_backoff *alloc_backoff(void) { struct ceph_osd_backoff *backoff; backoff = kzalloc(sizeof(*backoff), GFP_NOIO); if (!backoff) return NULL; RB_CLEAR_NODE(&backoff->spg_node); RB_CLEAR_NODE(&backoff->id_node); return backoff; } static void free_backoff(struct ceph_osd_backoff *backoff) { WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node)); WARN_ON(!RB_EMPTY_NODE(&backoff->id_node)); free_hoid(backoff->begin); free_hoid(backoff->end); kfree(backoff); } /* * Within a specific spgid, backoffs are managed by ->begin hoid. */ DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin, hoid_compare, RB_BYVAL, spg_node); static struct ceph_osd_backoff *lookup_containing_backoff(struct rb_root *root, const struct ceph_hobject_id *hoid) { struct rb_node *n = root->rb_node; while (n) { struct ceph_osd_backoff *cur = rb_entry(n, struct ceph_osd_backoff, spg_node); int cmp; cmp = hoid_compare(hoid, cur->begin); if (cmp < 0) { n = n->rb_left; } else if (cmp > 0) { if (hoid_compare(hoid, cur->end) < 0) return cur; n = n->rb_right; } else { return cur; } } return NULL; } /* * Each backoff has a unique id within its OSD session. */ DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node) static void clear_backoffs(struct ceph_osd *osd) { while (!RB_EMPTY_ROOT(&osd->o_backoff_mappings)) { struct ceph_spg_mapping *spg = rb_entry(rb_first(&osd->o_backoff_mappings), struct ceph_spg_mapping, node); while (!RB_EMPTY_ROOT(&spg->backoffs)) { struct ceph_osd_backoff *backoff = rb_entry(rb_first(&spg->backoffs), struct ceph_osd_backoff, spg_node); erase_backoff(&spg->backoffs, backoff); erase_backoff_by_id(&osd->o_backoffs_by_id, backoff); free_backoff(backoff); } erase_spg_mapping(&osd->o_backoff_mappings, spg); free_spg_mapping(spg); } } /* * Set up a temporary, non-owning view into @t. */ static void hoid_fill_from_target(struct ceph_hobject_id *hoid, const struct ceph_osd_request_target *t) { hoid->key = NULL; hoid->key_len = 0; hoid->oid = t->target_oid.name; hoid->oid_len = t->target_oid.name_len; hoid->snapid = CEPH_NOSNAP; hoid->hash = t->pgid.seed; hoid->is_max = false; if (t->target_oloc.pool_ns) { hoid->nspace = t->target_oloc.pool_ns->str; hoid->nspace_len = t->target_oloc.pool_ns->len; } else { hoid->nspace = NULL; hoid->nspace_len = 0; } hoid->pool = t->target_oloc.pool; ceph_hoid_build_hash_cache(hoid); } static bool should_plug_request(struct ceph_osd_request *req) { struct ceph_osd *osd = req->r_osd; struct ceph_spg_mapping *spg; struct ceph_osd_backoff *backoff; struct ceph_hobject_id hoid; spg = lookup_spg_mapping(&osd->o_backoff_mappings, &req->r_t.spgid); if (!spg) return false; hoid_fill_from_target(&hoid, &req->r_t); backoff = lookup_containing_backoff(&spg->backoffs, &hoid); if (!backoff) return false; dout("%s req %p tid %llu backoff osd%d spgid %llu.%xs%d id %llu\n", __func__, req, req->r_tid, osd->o_osd, backoff->spgid.pgid.pool, backoff->spgid.pgid.seed, backoff->spgid.shard, backoff->id); return true; } /* * Keep get_num_data_items() in sync with this function. */ static void setup_request_data(struct ceph_osd_request *req) { struct ceph_msg *request_msg = req->r_request; struct ceph_msg *reply_msg = req->r_reply; struct ceph_osd_req_op *op; if (req->r_request->num_data_items || req->r_reply->num_data_items) return; WARN_ON(request_msg->data_length || reply_msg->data_length); for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) { switch (op->op) { /* request */ case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: WARN_ON(op->indata_len != op->extent.length); ceph_osdc_msg_data_add(request_msg, &op->extent.osd_data); break; case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: WARN_ON(op->indata_len != op->xattr.name_len + op->xattr.value_len); ceph_osdc_msg_data_add(request_msg, &op->xattr.osd_data); break; case CEPH_OSD_OP_NOTIFY_ACK: ceph_osdc_msg_data_add(request_msg, &op->notify_ack.request_data); break; case CEPH_OSD_OP_COPY_FROM2: ceph_osdc_msg_data_add(request_msg, &op->copy_from.osd_data); break; /* reply */ case CEPH_OSD_OP_STAT: ceph_osdc_msg_data_add(reply_msg, &op->raw_data_in); break; case CEPH_OSD_OP_READ: case CEPH_OSD_OP_SPARSE_READ: ceph_osdc_msg_data_add(reply_msg, &op->extent.osd_data); break; case CEPH_OSD_OP_LIST_WATCHERS: ceph_osdc_msg_data_add(reply_msg, &op->list_watchers.response_data); break; /* both */ case CEPH_OSD_OP_CALL: WARN_ON(op->indata_len != op->cls.class_len + op->cls.method_len + op->cls.indata_len); ceph_osdc_msg_data_add(request_msg, &op->cls.request_info); /* optional, can be NONE */ ceph_osdc_msg_data_add(request_msg, &op->cls.request_data); /* optional, can be NONE */ ceph_osdc_msg_data_add(reply_msg, &op->cls.response_data); break; case CEPH_OSD_OP_NOTIFY: ceph_osdc_msg_data_add(request_msg, &op->notify.request_data); ceph_osdc_msg_data_add(reply_msg, &op->notify.response_data); break; } } } static void encode_pgid(void **p, const struct ceph_pg *pgid) { ceph_encode_8(p, 1); ceph_encode_64(p, pgid->pool); ceph_encode_32(p, pgid->seed); ceph_encode_32(p, -1); /* preferred */ } static void encode_spgid(void **p, const struct ceph_spg *spgid) { ceph_start_encoding(p, 1, 1, CEPH_PGID_ENCODING_LEN + 1); encode_pgid(p, &spgid->pgid); ceph_encode_8(p, spgid->shard); } static void encode_oloc(void **p, void *end, const struct ceph_object_locator *oloc) { ceph_start_encoding(p, 5, 4, ceph_oloc_encoding_size(oloc)); ceph_encode_64(p, oloc->pool); ceph_encode_32(p, -1); /* preferred */ ceph_encode_32(p, 0); /* key len */ if (oloc->pool_ns) ceph_encode_string(p, end, oloc->pool_ns->str, oloc->pool_ns->len); else ceph_encode_32(p, 0); } static void encode_request_partial(struct ceph_osd_request *req, struct ceph_msg *msg) { void *p = msg->front.iov_base; void *const end = p + msg->front_alloc_len; u32 data_len = 0; int i; if (req->r_flags & CEPH_OSD_FLAG_WRITE) { /* snapshots aren't writeable */ WARN_ON(req->r_snapid != CEPH_NOSNAP); } else { WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec || req->r_data_offset || req->r_snapc); } setup_request_data(req); encode_spgid(&p, &req->r_t.spgid); /* actual spg */ ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */ ceph_encode_32(&p, req->r_osdc->osdmap->epoch); ceph_encode_32(&p, req->r_flags); /* reqid */ ceph_start_encoding(&p, 2, 2, sizeof(struct ceph_osd_reqid)); memset(p, 0, sizeof(struct ceph_osd_reqid)); p += sizeof(struct ceph_osd_reqid); /* trace */ memset(p, 0, sizeof(struct ceph_blkin_trace_info)); p += sizeof(struct ceph_blkin_trace_info); ceph_encode_32(&p, 0); /* client_inc, always 0 */ ceph_encode_timespec64(p, &req->r_mtime); p += sizeof(struct ceph_timespec); encode_oloc(&p, end, &req->r_t.target_oloc); ceph_encode_string(&p, end, req->r_t.target_oid.name, req->r_t.target_oid.name_len); /* ops, can imply data */ ceph_encode_16(&p, req->r_num_ops); for (i = 0; i < req->r_num_ops; i++) { data_len += osd_req_encode_op(p, &req->r_ops[i]); p += sizeof(struct ceph_osd_op); } ceph_encode_64(&p, req->r_snapid); /* snapid */ if (req->r_snapc) { ceph_encode_64(&p, req->r_snapc->seq); ceph_encode_32(&p, req->r_snapc->num_snaps); for (i = 0; i < req->r_snapc->num_snaps; i++) ceph_encode_64(&p, req->r_snapc->snaps[i]); } else { ceph_encode_64(&p, 0); /* snap_seq */ ceph_encode_32(&p, 0); /* snaps len */ } ceph_encode_32(&p, req->r_attempts); /* retry_attempt */ BUG_ON(p > end - 8); /* space for features */ msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */ /* front_len is finalized in encode_request_finish() */ msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); msg->hdr.data_len = cpu_to_le32(data_len); /* * The header "data_off" is a hint to the receiver allowing it * to align received data into its buffers such that there's no * need to re-copy it before writing it to disk (direct I/O). */ msg->hdr.data_off = cpu_to_le16(req->r_data_offset); dout("%s req %p msg %p oid %s oid_len %d\n", __func__, req, msg, req->r_t.target_oid.name, req->r_t.target_oid.name_len); } static void encode_request_finish(struct ceph_msg *msg) { void *p = msg->front.iov_base; void *const partial_end = p + msg->front.iov_len; void *const end = p + msg->front_alloc_len; if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) { /* luminous OSD -- encode features and be done */ p = partial_end; ceph_encode_64(&p, msg->con->peer_features); } else { struct { char spgid[CEPH_ENCODING_START_BLK_LEN + CEPH_PGID_ENCODING_LEN + 1]; __le32 hash; __le32 epoch; __le32 flags; char reqid[CEPH_ENCODING_START_BLK_LEN + sizeof(struct ceph_osd_reqid)]; char trace[sizeof(struct ceph_blkin_trace_info)]; __le32 client_inc; struct ceph_timespec mtime; } __packed head; struct ceph_pg pgid; void *oloc, *oid, *tail; int oloc_len, oid_len, tail_len; int len; /* * Pre-luminous OSD -- reencode v8 into v4 using @head * as a temporary buffer. Encode the raw PG; the rest * is just a matter of moving oloc, oid and tail blobs * around. */ memcpy(&head, p, sizeof(head)); p += sizeof(head); oloc = p; p += CEPH_ENCODING_START_BLK_LEN; pgid.pool = ceph_decode_64(&p); p += 4 + 4; /* preferred, key len */ len = ceph_decode_32(&p); p += len; /* nspace */ oloc_len = p - oloc; oid = p; len = ceph_decode_32(&p); p += len; oid_len = p - oid; tail = p; tail_len = partial_end - p; p = msg->front.iov_base; ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc)); ceph_encode_copy(&p, &head.epoch, sizeof(head.epoch)); ceph_encode_copy(&p, &head.flags, sizeof(head.flags)); ceph_encode_copy(&p, &head.mtime, sizeof(head.mtime)); /* reassert_version */ memset(p, 0, sizeof(struct ceph_eversion)); p += sizeof(struct ceph_eversion); BUG_ON(p >= oloc); memmove(p, oloc, oloc_len); p += oloc_len; pgid.seed = le32_to_cpu(head.hash); encode_pgid(&p, &pgid); /* raw pg */ BUG_ON(p >= oid); memmove(p, oid, oid_len); p += oid_len; /* tail -- ops, snapid, snapc, retry_attempt */ BUG_ON(p >= tail); memmove(p, tail, tail_len); p += tail_len; msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */ } BUG_ON(p > end); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); dout("%s msg %p tid %llu %u+%u+%u v%d\n", __func__, msg, le64_to_cpu(msg->hdr.tid), le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len), le16_to_cpu(msg->hdr.version)); } /* * @req has to be assigned a tid and registered. */ static void send_request(struct ceph_osd_request *req) { struct ceph_osd *osd = req->r_osd; verify_osd_locked(osd); WARN_ON(osd->o_osd != req->r_t.osd); /* backoff? */ if (should_plug_request(req)) return; /* * We may have a previously queued request message hanging * around. Cancel it to avoid corrupting the msgr. */ if (req->r_sent) ceph_msg_revoke(req->r_request); req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR; if (req->r_attempts) req->r_flags |= CEPH_OSD_FLAG_RETRY; else WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY); encode_request_partial(req, req->r_request); dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d e%u flags 0x%x attempt %d\n", __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, req->r_t.spgid.pgid.pool, req->r_t.spgid.pgid.seed, req->r_t.spgid.shard, osd->o_osd, req->r_t.epoch, req->r_flags, req->r_attempts); req->r_t.paused = false; req->r_stamp = jiffies; req->r_attempts++; req->r_sent = osd->o_incarnation; req->r_request->hdr.tid = cpu_to_le64(req->r_tid); ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request)); } static void maybe_request_map(struct ceph_osd_client *osdc) { bool continuous = false; verify_osdc_locked(osdc); WARN_ON(!osdc->osdmap->epoch); if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) || ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { dout("%s osdc %p continuous\n", __func__, osdc); continuous = true; } else { dout("%s osdc %p onetime\n", __func__, osdc); } if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP, osdc->osdmap->epoch + 1, continuous)) ceph_monc_renew_subs(&osdc->client->monc); } static void complete_request(struct ceph_osd_request *req, int err); static void send_map_check(struct ceph_osd_request *req); static void __submit_request(struct ceph_osd_request *req, bool wrlocked) { struct ceph_osd_client *osdc = req->r_osdc; struct ceph_osd *osd; enum calc_target_result ct_res; int err = 0; bool need_send = false; bool promoted = false; WARN_ON(req->r_tid); dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); again: ct_res = calc_target(osdc, &req->r_t, false); if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) goto promote; osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked); if (IS_ERR(osd)) { WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked); goto promote; } if (osdc->abort_err) { dout("req %p abort_err %d\n", req, osdc->abort_err); err = osdc->abort_err; } else if (osdc->osdmap->epoch < osdc->epoch_barrier) { dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch, osdc->epoch_barrier); req->r_t.paused = true; maybe_request_map(osdc); } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { dout("req %p pausewr\n", req); req->r_t.paused = true; maybe_request_map(osdc); } else if ((req->r_flags & CEPH_OSD_FLAG_READ) && ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) { dout("req %p pauserd\n", req); req->r_t.paused = true; maybe_request_map(osdc); } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY | CEPH_OSD_FLAG_FULL_FORCE)) && (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || pool_full(osdc, req->r_t.base_oloc.pool))) { dout("req %p full/pool_full\n", req); if (ceph_test_opt(osdc->client, ABORT_ON_FULL)) { err = -ENOSPC; } else { if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) pr_warn_ratelimited("cluster is full (osdmap FULL)\n"); else pr_warn_ratelimited("pool %lld is full or reached quota\n", req->r_t.base_oloc.pool); req->r_t.paused = true; maybe_request_map(osdc); } } else if (!osd_homeless(osd)) { need_send = true; } else { maybe_request_map(osdc); } mutex_lock(&osd->lock); /* * Assign the tid atomically with send_request() to protect * multiple writes to the same object from racing with each * other, resulting in out of order ops on the OSDs. */ req->r_tid = atomic64_inc_return(&osdc->last_tid); link_request(osd, req); if (need_send) send_request(req); else if (err) complete_request(req, err); mutex_unlock(&osd->lock); if (!err && ct_res == CALC_TARGET_POOL_DNE) send_map_check(req); if (promoted) downgrade_write(&osdc->lock); return; promote: up_read(&osdc->lock); down_write(&osdc->lock); wrlocked = true; promoted = true; goto again; } static void account_request(struct ceph_osd_request *req) { WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK)); WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))); req->r_flags |= CEPH_OSD_FLAG_ONDISK; atomic_inc(&req->r_osdc->num_requests); req->r_start_stamp = jiffies; req->r_start_latency = ktime_get(); } static void submit_request(struct ceph_osd_request *req, bool wrlocked) { ceph_osdc_get_request(req); account_request(req); __submit_request(req, wrlocked); } static void finish_request(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid)); dout("%s req %p tid %llu\n", __func__, req, req->r_tid); req->r_end_latency = ktime_get(); if (req->r_osd) { ceph_init_sparse_read(&req->r_osd->o_sparse_read); unlink_request(req->r_osd, req); } atomic_dec(&osdc->num_requests); /* * If an OSD has failed or returned and a request has been sent * twice, it's possible to get a reply and end up here while the * request message is queued for delivery. We will ignore the * reply, so not a big deal, but better to try and catch it. */ ceph_msg_revoke(req->r_request); ceph_msg_revoke_incoming(req->r_reply); } static void __complete_request(struct ceph_osd_request *req) { dout("%s req %p tid %llu cb %ps result %d\n", __func__, req, req->r_tid, req->r_callback, req->r_result); if (req->r_callback) req->r_callback(req); complete_all(&req->r_completion); ceph_osdc_put_request(req); } static void complete_request_workfn(struct work_struct *work) { struct ceph_osd_request *req = container_of(work, struct ceph_osd_request, r_complete_work); __complete_request(req); } /* * This is open-coded in handle_reply(). */ static void complete_request(struct ceph_osd_request *req, int err) { dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err); req->r_result = err; finish_request(req); INIT_WORK(&req->r_complete_work, complete_request_workfn); queue_work(req->r_osdc->completion_wq, &req->r_complete_work); } static void cancel_map_check(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; struct ceph_osd_request *lookup_req; verify_osdc_wrlocked(osdc); lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid); if (!lookup_req) return; WARN_ON(lookup_req != req); erase_request_mc(&osdc->map_checks, req); ceph_osdc_put_request(req); } static void cancel_request(struct ceph_osd_request *req) { dout("%s req %p tid %llu\n", __func__, req, req->r_tid); cancel_map_check(req); finish_request(req); complete_all(&req->r_completion); ceph_osdc_put_request(req); } static void abort_request(struct ceph_osd_request *req, int err) { dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err); cancel_map_check(req); complete_request(req, err); } static int abort_fn(struct ceph_osd_request *req, void *arg) { int err = *(int *)arg; abort_request(req, err); return 0; /* continue iteration */ } /* * Abort all in-flight requests with @err and arrange for all future * requests to be failed immediately. */ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err) { dout("%s osdc %p err %d\n", __func__, osdc, err); down_write(&osdc->lock); for_each_request(osdc, abort_fn, &err); osdc->abort_err = err; up_write(&osdc->lock); } EXPORT_SYMBOL(ceph_osdc_abort_requests); void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc) { down_write(&osdc->lock); osdc->abort_err = 0; up_write(&osdc->lock); } EXPORT_SYMBOL(ceph_osdc_clear_abort_err); static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) { if (likely(eb > osdc->epoch_barrier)) { dout("updating epoch_barrier from %u to %u\n", osdc->epoch_barrier, eb); osdc->epoch_barrier = eb; /* Request map if we're not to the barrier yet */ if (eb > osdc->osdmap->epoch) maybe_request_map(osdc); } } void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) { down_read(&osdc->lock); if (unlikely(eb > osdc->epoch_barrier)) { up_read(&osdc->lock); down_write(&osdc->lock); update_epoch_barrier(osdc, eb); up_write(&osdc->lock); } else { up_read(&osdc->lock); } } EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier); /* * We can end up releasing caps as a result of abort_request(). * In that case, we probably want to ensure that the cap release message * has an updated epoch barrier in it, so set the epoch barrier prior to * aborting the first request. */ static int abort_on_full_fn(struct ceph_osd_request *req, void *arg) { struct ceph_osd_client *osdc = req->r_osdc; bool *victims = arg; if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || pool_full(osdc, req->r_t.base_oloc.pool))) { if (!*victims) { update_epoch_barrier(osdc, osdc->osdmap->epoch); *victims = true; } abort_request(req, -ENOSPC); } return 0; /* continue iteration */ } /* * Drop all pending requests that are stalled waiting on a full condition to * clear, and complete them with ENOSPC as the return code. Set the * osdc->epoch_barrier to the latest map epoch that we've seen if any were * cancelled. */ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) { bool victims = false; if (ceph_test_opt(osdc->client, ABORT_ON_FULL) && (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc))) for_each_request(osdc, abort_on_full_fn, &victims); } static void check_pool_dne(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; struct ceph_osdmap *map = osdc->osdmap; verify_osdc_wrlocked(osdc); WARN_ON(!map->epoch); if (req->r_attempts) { /* * We sent a request earlier, which means that * previously the pool existed, and now it does not * (i.e., it was deleted). */ req->r_map_dne_bound = map->epoch; dout("%s req %p tid %llu pool disappeared\n", __func__, req, req->r_tid); } else { dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__, req, req->r_tid, req->r_map_dne_bound, map->epoch); } if (req->r_map_dne_bound) { if (map->epoch >= req->r_map_dne_bound) { /* we had a new enough map */ pr_info_ratelimited("tid %llu pool does not exist\n", req->r_tid); complete_request(req, -ENOENT); } } else { send_map_check(req); } } static void map_check_cb(struct ceph_mon_generic_request *greq) { struct ceph_osd_client *osdc = &greq->monc->client->osdc; struct ceph_osd_request *req; u64 tid = greq->private_data; WARN_ON(greq->result || !greq->u.newest); down_write(&osdc->lock); req = lookup_request_mc(&osdc->map_checks, tid); if (!req) { dout("%s tid %llu dne\n", __func__, tid); goto out_unlock; } dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__, req, req->r_tid, req->r_map_dne_bound, greq->u.newest); if (!req->r_map_dne_bound) req->r_map_dne_bound = greq->u.newest; erase_request_mc(&osdc->map_checks, req); check_pool_dne(req); ceph_osdc_put_request(req); out_unlock: up_write(&osdc->lock); } static void send_map_check(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; struct ceph_osd_request *lookup_req; int ret; verify_osdc_wrlocked(osdc); lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid); if (lookup_req) { WARN_ON(lookup_req != req); return; } ceph_osdc_get_request(req); insert_request_mc(&osdc->map_checks, req); ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap", map_check_cb, req->r_tid); WARN_ON(ret); } /* * lingering requests, watch/notify v2 infrastructure */ static void linger_release(struct kref *kref) { struct ceph_osd_linger_request *lreq = container_of(kref, struct ceph_osd_linger_request, kref); dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq, lreq->reg_req, lreq->ping_req); WARN_ON(!RB_EMPTY_NODE(&lreq->node)); WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node)); WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node)); WARN_ON(!list_empty(&lreq->scan_item)); WARN_ON(!list_empty(&lreq->pending_lworks)); WARN_ON(lreq->osd); if (lreq->request_pl) ceph_pagelist_release(lreq->request_pl); if (lreq->notify_id_pages) ceph_release_page_vector(lreq->notify_id_pages, 1); ceph_osdc_put_request(lreq->reg_req); ceph_osdc_put_request(lreq->ping_req); target_destroy(&lreq->t); kfree(lreq); } static void linger_put(struct ceph_osd_linger_request *lreq) { if (lreq) kref_put(&lreq->kref, linger_release); } static struct ceph_osd_linger_request * linger_get(struct ceph_osd_linger_request *lreq) { kref_get(&lreq->kref); return lreq; } static struct ceph_osd_linger_request * linger_alloc(struct ceph_osd_client *osdc) { struct ceph_osd_linger_request *lreq; lreq = kzalloc(sizeof(*lreq), GFP_NOIO); if (!lreq) return NULL; kref_init(&lreq->kref); mutex_init(&lreq->lock); RB_CLEAR_NODE(&lreq->node); RB_CLEAR_NODE(&lreq->osdc_node); RB_CLEAR_NODE(&lreq->mc_node); INIT_LIST_HEAD(&lreq->scan_item); INIT_LIST_HEAD(&lreq->pending_lworks); init_completion(&lreq->reg_commit_wait); init_completion(&lreq->notify_finish_wait); lreq->osdc = osdc; target_init(&lreq->t); dout("%s lreq %p\n", __func__, lreq); return lreq; } DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node) DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node) DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node) /* * Create linger request <-> OSD session relation. * * @lreq has to be registered, @osd may be homeless. */ static void link_linger(struct ceph_osd *osd, struct ceph_osd_linger_request *lreq) { verify_osd_locked(osd); WARN_ON(!lreq->linger_id || lreq->osd); dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd, osd->o_osd, lreq, lreq->linger_id); if (!osd_homeless(osd)) __remove_osd_from_lru(osd); else atomic_inc(&osd->o_osdc->num_homeless); get_osd(osd); insert_linger(&osd->o_linger_requests, lreq); lreq->osd = osd; } static void unlink_linger(struct ceph_osd *osd, struct ceph_osd_linger_request *lreq) { verify_osd_locked(osd); WARN_ON(lreq->osd != osd); dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd, osd->o_osd, lreq, lreq->linger_id); lreq->osd = NULL; erase_linger(&osd->o_linger_requests, lreq); put_osd(osd); if (!osd_homeless(osd)) maybe_move_osd_to_lru(osd); else atomic_dec(&osd->o_osdc->num_homeless); } static bool __linger_registered(struct ceph_osd_linger_request *lreq) { verify_osdc_locked(lreq->osdc); return !RB_EMPTY_NODE(&lreq->osdc_node); } static bool linger_registered(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; bool registered; down_read(&osdc->lock); registered = __linger_registered(lreq); up_read(&osdc->lock); return registered; } static void linger_register(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; verify_osdc_wrlocked(osdc); WARN_ON(lreq->linger_id); linger_get(lreq); lreq->linger_id = ++osdc->last_linger_id; insert_linger_osdc(&osdc->linger_requests, lreq); } static void linger_unregister(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; verify_osdc_wrlocked(osdc); erase_linger_osdc(&osdc->linger_requests, lreq); linger_put(lreq); } static void cancel_linger_request(struct ceph_osd_request *req) { struct ceph_osd_linger_request *lreq = req->r_priv; WARN_ON(!req->r_linger); cancel_request(req); linger_put(lreq); } struct linger_work { struct work_struct work; struct ceph_osd_linger_request *lreq; struct list_head pending_item; unsigned long queued_stamp; union { struct { u64 notify_id; u64 notifier_id; void *payload; /* points into @msg front */ size_t payload_len; struct ceph_msg *msg; /* for ceph_msg_put() */ } notify; struct { int err; } error; }; }; static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq, work_func_t workfn) { struct linger_work *lwork; lwork = kzalloc(sizeof(*lwork), GFP_NOIO); if (!lwork) return NULL; INIT_WORK(&lwork->work, workfn); INIT_LIST_HEAD(&lwork->pending_item); lwork->lreq = linger_get(lreq); return lwork; } static void lwork_free(struct linger_work *lwork) { struct ceph_osd_linger_request *lreq = lwork->lreq; mutex_lock(&lreq->lock); list_del(&lwork->pending_item); mutex_unlock(&lreq->lock); linger_put(lreq); kfree(lwork); } static void lwork_queue(struct linger_work *lwork) { struct ceph_osd_linger_request *lreq = lwork->lreq; struct ceph_osd_client *osdc = lreq->osdc; verify_lreq_locked(lreq); WARN_ON(!list_empty(&lwork->pending_item)); lwork->queued_stamp = jiffies; list_add_tail(&lwork->pending_item, &lreq->pending_lworks); queue_work(osdc->notify_wq, &lwork->work); } static void do_watch_notify(struct work_struct *w) { struct linger_work *lwork = container_of(w, struct linger_work, work); struct ceph_osd_linger_request *lreq = lwork->lreq; if (!linger_registered(lreq)) { dout("%s lreq %p not registered\n", __func__, lreq); goto out; } WARN_ON(!lreq->is_watch); dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n", __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id, lwork->notify.payload_len); lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id, lwork->notify.notifier_id, lwork->notify.payload, lwork->notify.payload_len); out: ceph_msg_put(lwork->notify.msg); lwork_free(lwork); } static void do_watch_error(struct work_struct *w) { struct linger_work *lwork = container_of(w, struct linger_work, work); struct ceph_osd_linger_request *lreq = lwork->lreq; if (!linger_registered(lreq)) { dout("%s lreq %p not registered\n", __func__, lreq); goto out; } dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err); lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err); out: lwork_free(lwork); } static void queue_watch_error(struct ceph_osd_linger_request *lreq) { struct linger_work *lwork; lwork = lwork_alloc(lreq, do_watch_error); if (!lwork) { pr_err("failed to allocate error-lwork\n"); return; } lwork->error.err = lreq->last_error; lwork_queue(lwork); } static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq, int result) { if (!completion_done(&lreq->reg_commit_wait)) { lreq->reg_commit_error = (result <= 0 ? result : 0); complete_all(&lreq->reg_commit_wait); } } static void linger_commit_cb(struct ceph_osd_request *req) { struct ceph_osd_linger_request *lreq = req->r_priv; mutex_lock(&lreq->lock); if (req != lreq->reg_req) { dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n", __func__, lreq, lreq->linger_id, req, lreq->reg_req); goto out; } dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, lreq->linger_id, req->r_result); linger_reg_commit_complete(lreq, req->r_result); lreq->committed = true; if (!lreq->is_watch) { struct ceph_osd_data *osd_data = osd_req_op_data(req, 0, notify, response_data); void *p = page_address(osd_data->pages[0]); WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY || osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); /* make note of the notify_id */ if (req->r_ops[0].outdata_len >= sizeof(u64)) { lreq->notify_id = ceph_decode_64(&p); dout("lreq %p notify_id %llu\n", lreq, lreq->notify_id); } else { dout("lreq %p no notify_id\n", lreq); } } out: mutex_unlock(&lreq->lock); linger_put(lreq); } static int normalize_watch_error(int err) { /* * Translate ENOENT -> ENOTCONN so that a delete->disconnection * notification and a failure to reconnect because we raced with * the delete appear the same to the user. */ if (err == -ENOENT) err = -ENOTCONN; return err; } static void linger_reconnect_cb(struct ceph_osd_request *req) { struct ceph_osd_linger_request *lreq = req->r_priv; mutex_lock(&lreq->lock); if (req != lreq->reg_req) { dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n", __func__, lreq, lreq->linger_id, req, lreq->reg_req); goto out; } dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__, lreq, lreq->linger_id, req->r_result, lreq->last_error); if (req->r_result < 0) { if (!lreq->last_error) { lreq->last_error = normalize_watch_error(req->r_result); queue_watch_error(lreq); } } out: mutex_unlock(&lreq->lock); linger_put(lreq); } static void send_linger(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd_request *req; int ret; verify_osdc_wrlocked(osdc); mutex_lock(&lreq->lock); dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); if (lreq->reg_req) { if (lreq->reg_req->r_osd) cancel_linger_request(lreq->reg_req); ceph_osdc_put_request(lreq->reg_req); } req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO); BUG_ON(!req); target_copy(&req->r_t, &lreq->t); req->r_mtime = lreq->mtime; if (lreq->is_watch && lreq->committed) { osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_RECONNECT, lreq->linger_id, ++lreq->register_gen); dout("lreq %p reconnect register_gen %u\n", lreq, req->r_ops[0].watch.gen); req->r_callback = linger_reconnect_cb; } else { if (lreq->is_watch) { osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_WATCH, lreq->linger_id, 0); } else { lreq->notify_id = 0; refcount_inc(&lreq->request_pl->refcnt); osd_req_op_notify_init(req, 0, lreq->linger_id, lreq->request_pl); ceph_osd_data_pages_init( osd_req_op_data(req, 0, notify, response_data), lreq->notify_id_pages, PAGE_SIZE, 0, false, false); } dout("lreq %p register\n", lreq); req->r_callback = linger_commit_cb; } ret = ceph_osdc_alloc_messages(req, GFP_NOIO); BUG_ON(ret); req->r_priv = linger_get(lreq); req->r_linger = true; lreq->reg_req = req; mutex_unlock(&lreq->lock); submit_request(req, true); } static void linger_ping_cb(struct ceph_osd_request *req) { struct ceph_osd_linger_request *lreq = req->r_priv; mutex_lock(&lreq->lock); if (req != lreq->ping_req) { dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n", __func__, lreq, lreq->linger_id, req, lreq->ping_req); goto out; } dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n", __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent, lreq->last_error); if (lreq->register_gen == req->r_ops[0].watch.gen) { if (!req->r_result) { lreq->watch_valid_thru = lreq->ping_sent; } else if (!lreq->last_error) { lreq->last_error = normalize_watch_error(req->r_result); queue_watch_error(lreq); } } else { dout("lreq %p register_gen %u ignoring old pong %u\n", lreq, lreq->register_gen, req->r_ops[0].watch.gen); } out: mutex_unlock(&lreq->lock); linger_put(lreq); } static void send_linger_ping(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd_request *req; int ret; if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) { dout("%s PAUSERD\n", __func__); return; } lreq->ping_sent = jiffies; dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n", __func__, lreq, lreq->linger_id, lreq->ping_sent, lreq->register_gen); if (lreq->ping_req) { if (lreq->ping_req->r_osd) cancel_linger_request(lreq->ping_req); ceph_osdc_put_request(lreq->ping_req); } req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO); BUG_ON(!req); target_copy(&req->r_t, &lreq->t); osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_PING, lreq->linger_id, lreq->register_gen); req->r_callback = linger_ping_cb; ret = ceph_osdc_alloc_messages(req, GFP_NOIO); BUG_ON(ret); req->r_priv = linger_get(lreq); req->r_linger = true; lreq->ping_req = req; ceph_osdc_get_request(req); account_request(req); req->r_tid = atomic64_inc_return(&osdc->last_tid); link_request(lreq->osd, req); send_request(req); } static void linger_submit(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd *osd; down_write(&osdc->lock); linger_register(lreq); calc_target(osdc, &lreq->t, false); osd = lookup_create_osd(osdc, lreq->t.osd, true); link_linger(osd, lreq); send_linger(lreq); up_write(&osdc->lock); } static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd_linger_request *lookup_lreq; verify_osdc_wrlocked(osdc); lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks, lreq->linger_id); if (!lookup_lreq) return; WARN_ON(lookup_lreq != lreq); erase_linger_mc(&osdc->linger_map_checks, lreq); linger_put(lreq); } /* * @lreq has to be both registered and linked. */ static void __linger_cancel(struct ceph_osd_linger_request *lreq) { if (lreq->ping_req && lreq->ping_req->r_osd) cancel_linger_request(lreq->ping_req); if (lreq->reg_req && lreq->reg_req->r_osd) cancel_linger_request(lreq->reg_req); cancel_linger_map_check(lreq); unlink_linger(lreq->osd, lreq); linger_unregister(lreq); } static void linger_cancel(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; down_write(&osdc->lock); if (__linger_registered(lreq)) __linger_cancel(lreq); up_write(&osdc->lock); } static void send_linger_map_check(struct ceph_osd_linger_request *lreq); static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osdmap *map = osdc->osdmap; verify_osdc_wrlocked(osdc); WARN_ON(!map->epoch); if (lreq->register_gen) { lreq->map_dne_bound = map->epoch; dout("%s lreq %p linger_id %llu pool disappeared\n", __func__, lreq, lreq->linger_id); } else { dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n", __func__, lreq, lreq->linger_id, lreq->map_dne_bound, map->epoch); } if (lreq->map_dne_bound) { if (map->epoch >= lreq->map_dne_bound) { /* we had a new enough map */ pr_info("linger_id %llu pool does not exist\n", lreq->linger_id); linger_reg_commit_complete(lreq, -ENOENT); __linger_cancel(lreq); } } else { send_linger_map_check(lreq); } } static void linger_map_check_cb(struct ceph_mon_generic_request *greq) { struct ceph_osd_client *osdc = &greq->monc->client->osdc; struct ceph_osd_linger_request *lreq; u64 linger_id = greq->private_data; WARN_ON(greq->result || !greq->u.newest); down_write(&osdc->lock); lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id); if (!lreq) { dout("%s linger_id %llu dne\n", __func__, linger_id); goto out_unlock; } dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n", __func__, lreq, lreq->linger_id, lreq->map_dne_bound, greq->u.newest); if (!lreq->map_dne_bound) lreq->map_dne_bound = greq->u.newest; erase_linger_mc(&osdc->linger_map_checks, lreq); check_linger_pool_dne(lreq); linger_put(lreq); out_unlock: up_write(&osdc->lock); } static void send_linger_map_check(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd_linger_request *lookup_lreq; int ret; verify_osdc_wrlocked(osdc); lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks, lreq->linger_id); if (lookup_lreq) { WARN_ON(lookup_lreq != lreq); return; } linger_get(lreq); insert_linger_mc(&osdc->linger_map_checks, lreq); ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap", linger_map_check_cb, lreq->linger_id); WARN_ON(ret); } static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) { int ret; dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); ret = wait_for_completion_killable(&lreq->reg_commit_wait); return ret ?: lreq->reg_commit_error; } static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq, unsigned long timeout) { long left; dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); left = wait_for_completion_killable_timeout(&lreq->notify_finish_wait, ceph_timeout_jiffies(timeout)); if (left <= 0) left = left ?: -ETIMEDOUT; else left = lreq->notify_finish_error; /* completed */ return left; } /* * Timeout callback, called every N seconds. When 1 or more OSD * requests has been active for more than N seconds, we send a keepalive * (tag + timestamp) to its OSD to ensure any communications channel * reset is detected. */ static void handle_timeout(struct work_struct *work) { struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, timeout_work.work); struct ceph_options *opts = osdc->client->options; unsigned long cutoff = jiffies - opts->osd_keepalive_timeout; unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout; LIST_HEAD(slow_osds); struct rb_node *n, *p; dout("%s osdc %p\n", __func__, osdc); down_write(&osdc->lock); /* * ping osds that are a bit slow. this ensures that if there * is a break in the TCP connection we will notice, and reopen * a connection with that osd (from the fault callback). */ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); bool found = false; for (p = rb_first(&osd->o_requests); p; ) { struct ceph_osd_request *req = rb_entry(p, struct ceph_osd_request, r_node); p = rb_next(p); /* abort_request() */ if (time_before(req->r_stamp, cutoff)) { dout(" req %p tid %llu on osd%d is laggy\n", req, req->r_tid, osd->o_osd); found = true; } if (opts->osd_request_timeout && time_before(req->r_start_stamp, expiry_cutoff)) { pr_err_ratelimited("tid %llu on osd%d timeout\n", req->r_tid, osd->o_osd); abort_request(req, -ETIMEDOUT); } } for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) { struct ceph_osd_linger_request *lreq = rb_entry(p, struct ceph_osd_linger_request, node); dout(" lreq %p linger_id %llu is served by osd%d\n", lreq, lreq->linger_id, osd->o_osd); found = true; mutex_lock(&lreq->lock); if (lreq->is_watch && lreq->committed && !lreq->last_error) send_linger_ping(lreq); mutex_unlock(&lreq->lock); } if (found) list_move_tail(&osd->o_keepalive_item, &slow_osds); } if (opts->osd_request_timeout) { for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) { struct ceph_osd_request *req = rb_entry(p, struct ceph_osd_request, r_node); p = rb_next(p); /* abort_request() */ if (time_before(req->r_start_stamp, expiry_cutoff)) { pr_err_ratelimited("tid %llu on osd%d timeout\n", req->r_tid, osdc->homeless_osd.o_osd); abort_request(req, -ETIMEDOUT); } } } if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds)) maybe_request_map(osdc); while (!list_empty(&slow_osds)) { struct ceph_osd *osd = list_first_entry(&slow_osds, struct ceph_osd, o_keepalive_item); list_del_init(&osd->o_keepalive_item); ceph_con_keepalive(&osd->o_con); } up_write(&osdc->lock); schedule_delayed_work(&osdc->timeout_work, osdc->client->options->osd_keepalive_timeout); } static void handle_osds_timeout(struct work_struct *work) { struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, osds_timeout_work.work); unsigned long delay = osdc->client->options->osd_idle_ttl / 4; struct ceph_osd *osd, *nosd; dout("%s osdc %p\n", __func__, osdc); down_write(&osdc->lock); list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { if (time_before(jiffies, osd->lru_ttl)) break; WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); close_osd(osd); } up_write(&osdc->lock); schedule_delayed_work(&osdc->osds_timeout_work, round_jiffies_relative(delay)); } static int ceph_oloc_decode(void **p, void *end, struct ceph_object_locator *oloc) { u8 struct_v, struct_cv; u32 len; void *struct_end; int ret = 0; ceph_decode_need(p, end, 1 + 1 + 4, e_inval); struct_v = ceph_decode_8(p); struct_cv = ceph_decode_8(p); if (struct_v < 3) { pr_warn("got v %d < 3 cv %d of ceph_object_locator\n", struct_v, struct_cv); goto e_inval; } if (struct_cv > 6) { pr_warn("got v %d cv %d > 6 of ceph_object_locator\n", struct_v, struct_cv); goto e_inval; } len = ceph_decode_32(p); ceph_decode_need(p, end, len, e_inval); struct_end = *p + len; oloc->pool = ceph_decode_64(p); *p += 4; /* skip preferred */ len = ceph_decode_32(p); if (len > 0) { pr_warn("ceph_object_locator::key is set\n"); goto e_inval; } if (struct_v >= 5) { bool changed = false; len = ceph_decode_32(p); if (len > 0) { ceph_decode_need(p, end, len, e_inval); if (!oloc->pool_ns || ceph_compare_string(oloc->pool_ns, *p, len)) changed = true; *p += len; } else { if (oloc->pool_ns) changed = true; } if (changed) { /* redirect changes namespace */ pr_warn("ceph_object_locator::nspace is changed\n"); goto e_inval; } } if (struct_v >= 6) { s64 hash = ceph_decode_64(p); if (hash != -1) { pr_warn("ceph_object_locator::hash is set\n"); goto e_inval; } } /* skip the rest */ *p = struct_end; out: return ret; e_inval: ret = -EINVAL; goto out; } static int ceph_redirect_decode(void **p, void *end, struct ceph_request_redirect *redir) { u8 struct_v, struct_cv; u32 len; void *struct_end; int ret; ceph_decode_need(p, end, 1 + 1 + 4, e_inval); struct_v = ceph_decode_8(p); struct_cv = ceph_decode_8(p); if (struct_cv > 1) { pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n", struct_v, struct_cv); goto e_inval; } len = ceph_decode_32(p); ceph_decode_need(p, end, len, e_inval); struct_end = *p + len; ret = ceph_oloc_decode(p, end, &redir->oloc); if (ret) goto out; len = ceph_decode_32(p); if (len > 0) { pr_warn("ceph_request_redirect::object_name is set\n"); goto e_inval; } /* skip the rest */ *p = struct_end; out: return ret; e_inval: ret = -EINVAL; goto out; } struct MOSDOpReply { struct ceph_pg pgid; u64 flags; int result; u32 epoch; int num_ops; u32 outdata_len[CEPH_OSD_MAX_OPS]; s32 rval[CEPH_OSD_MAX_OPS]; int retry_attempt; struct ceph_eversion replay_version; u64 user_version; struct ceph_request_redirect redirect; }; static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m) { void *p = msg->front.iov_base; void *const end = p + msg->front.iov_len; u16 version = le16_to_cpu(msg->hdr.version); struct ceph_eversion bad_replay_version; u8 decode_redir; u32 len; int ret; int i; ceph_decode_32_safe(&p, end, len, e_inval); ceph_decode_need(&p, end, len, e_inval); p += len; /* skip oid */ ret = ceph_decode_pgid(&p, end, &m->pgid); if (ret) return ret; ceph_decode_64_safe(&p, end, m->flags, e_inval); ceph_decode_32_safe(&p, end, m->result, e_inval); ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval); memcpy(&bad_replay_version, p, sizeof(bad_replay_version)); p += sizeof(bad_replay_version); ceph_decode_32_safe(&p, end, m->epoch, e_inval); ceph_decode_32_safe(&p, end, m->num_ops, e_inval); if (m->num_ops > ARRAY_SIZE(m->outdata_len)) goto e_inval; ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op), e_inval); for (i = 0; i < m->num_ops; i++) { struct ceph_osd_op *op = p; m->outdata_len[i] = le32_to_cpu(op->payload_len); p += sizeof(*op); } ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval); for (i = 0; i < m->num_ops; i++) ceph_decode_32_safe(&p, end, m->rval[i], e_inval); if (version >= 5) { ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval); memcpy(&m->replay_version, p, sizeof(m->replay_version)); p += sizeof(m->replay_version); ceph_decode_64_safe(&p, end, m->user_version, e_inval); } else { m->replay_version = bad_replay_version; /* struct */ m->user_version = le64_to_cpu(m->replay_version.version); } if (version >= 6) { if (version >= 7) ceph_decode_8_safe(&p, end, decode_redir, e_inval); else decode_redir = 1; } else { decode_redir = 0; } if (decode_redir) { ret = ceph_redirect_decode(&p, end, &m->redirect); if (ret) return ret; } else { ceph_oloc_init(&m->redirect.oloc); } return 0; e_inval: return -EINVAL; } /* * Handle MOSDOpReply. Set ->r_result and call the callback if it is * specified. */ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) { struct ceph_osd_client *osdc = osd->o_osdc; struct ceph_osd_request *req; struct MOSDOpReply m; u64 tid = le64_to_cpu(msg->hdr.tid); u32 data_len = 0; int ret; int i; dout("%s msg %p tid %llu\n", __func__, msg, tid); down_read(&osdc->lock); if (!osd_registered(osd)) { dout("%s osd%d unknown\n", __func__, osd->o_osd); goto out_unlock_osdc; } WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num)); mutex_lock(&osd->lock); req = lookup_request(&osd->o_requests, tid); if (!req) { dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid); goto out_unlock_session; } m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns; ret = decode_MOSDOpReply(msg, &m); m.redirect.oloc.pool_ns = NULL; if (ret) { pr_err("failed to decode MOSDOpReply for tid %llu: %d\n", req->r_tid, ret); ceph_msg_dump(msg); goto fail_request; } dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n", __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed, m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch), le64_to_cpu(m.replay_version.version), m.user_version); if (m.retry_attempt >= 0) { if (m.retry_attempt != req->r_attempts - 1) { dout("req %p tid %llu retry_attempt %d != %d, ignoring\n", req, req->r_tid, m.retry_attempt, req->r_attempts - 1); goto out_unlock_session; } } else { WARN_ON(1); /* MOSDOpReply v4 is assumed */ } if (!ceph_oloc_empty(&m.redirect.oloc)) { dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid, m.redirect.oloc.pool); unlink_request(osd, req); mutex_unlock(&osd->lock); /* * Not ceph_oloc_copy() - changing pool_ns is not * supported. */ req->r_t.target_oloc.pool = m.redirect.oloc.pool; req->r_flags |= CEPH_OSD_FLAG_REDIRECTED | CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_IGNORE_CACHE; req->r_tid = 0; __submit_request(req, false); goto out_unlock_osdc; } if (m.result == -EAGAIN) { dout("req %p tid %llu EAGAIN\n", req, req->r_tid); unlink_request(osd, req); mutex_unlock(&osd->lock); /* * The object is missing on the replica or not (yet) * readable. Clear pgid to force a resend to the primary * via legacy_change. */ req->r_t.pgid.pool = 0; req->r_t.pgid.seed = 0; WARN_ON(!req->r_t.used_replica); req->r_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS | CEPH_OSD_FLAG_LOCALIZE_READS); req->r_tid = 0; __submit_request(req, false); goto out_unlock_osdc; } if (m.num_ops != req->r_num_ops) { pr_err("num_ops %d != %d for tid %llu\n", m.num_ops, req->r_num_ops, req->r_tid); goto fail_request; } for (i = 0; i < req->r_num_ops; i++) { dout(" req %p tid %llu op %d rval %d len %u\n", req, req->r_tid, i, m.rval[i], m.outdata_len[i]); req->r_ops[i].rval = m.rval[i]; req->r_ops[i].outdata_len = m.outdata_len[i]; data_len += m.outdata_len[i]; } if (data_len != le32_to_cpu(msg->hdr.data_len)) { pr_err("sum of lens %u != %u for tid %llu\n", data_len, le32_to_cpu(msg->hdr.data_len), req->r_tid); goto fail_request; } dout("%s req %p tid %llu result %d data_len %u\n", __func__, req, req->r_tid, m.result, data_len); /* * Since we only ever request ONDISK, we should only ever get * one (type of) reply back. */ WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK)); req->r_version = m.user_version; req->r_result = m.result ?: data_len; finish_request(req); mutex_unlock(&osd->lock); up_read(&osdc->lock); __complete_request(req); return; fail_request: complete_request(req, -EIO); out_unlock_session: mutex_unlock(&osd->lock); out_unlock_osdc: up_read(&osdc->lock); } static void set_pool_was_full(struct ceph_osd_client *osdc) { struct rb_node *n; for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pi = rb_entry(n, struct ceph_pg_pool_info, node); pi->was_full = __pool_full(pi); } } static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id) { struct ceph_pg_pool_info *pi; pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id); if (!pi) return false; return pi->was_full && !__pool_full(pi); } static enum calc_target_result recalc_linger_target(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; enum calc_target_result ct_res; ct_res = calc_target(osdc, &lreq->t, true); if (ct_res == CALC_TARGET_NEED_RESEND) { struct ceph_osd *osd; osd = lookup_create_osd(osdc, lreq->t.osd, true); if (osd != lreq->osd) { unlink_linger(lreq->osd, lreq); link_linger(osd, lreq); } } return ct_res; } /* * Requeue requests whose mapping to an OSD has changed. */ static void scan_requests(struct ceph_osd *osd, bool force_resend, bool cleared_full, bool check_pool_cleared_full, struct rb_root *need_resend, struct list_head *need_resend_linger) { struct ceph_osd_client *osdc = osd->o_osdc; struct rb_node *n; bool force_resend_writes; for (n = rb_first(&osd->o_linger_requests); n; ) { struct ceph_osd_linger_request *lreq = rb_entry(n, struct ceph_osd_linger_request, node); enum calc_target_result ct_res; n = rb_next(n); /* recalc_linger_target() */ dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); ct_res = recalc_linger_target(lreq); switch (ct_res) { case CALC_TARGET_NO_ACTION: force_resend_writes = cleared_full || (check_pool_cleared_full && pool_cleared_full(osdc, lreq->t.base_oloc.pool)); if (!force_resend && !force_resend_writes) break; fallthrough; case CALC_TARGET_NEED_RESEND: cancel_linger_map_check(lreq); /* * scan_requests() for the previous epoch(s) * may have already added it to the list, since * it's not unlinked here. */ if (list_empty(&lreq->scan_item)) list_add_tail(&lreq->scan_item, need_resend_linger); break; case CALC_TARGET_POOL_DNE: list_del_init(&lreq->scan_item); check_linger_pool_dne(lreq); break; } } for (n = rb_first(&osd->o_requests); n; ) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); enum calc_target_result ct_res; n = rb_next(n); /* unlink_request(), check_pool_dne() */ dout("%s req %p tid %llu\n", __func__, req, req->r_tid); ct_res = calc_target(osdc, &req->r_t, false); switch (ct_res) { case CALC_TARGET_NO_ACTION: force_resend_writes = cleared_full || (check_pool_cleared_full && pool_cleared_full(osdc, req->r_t.base_oloc.pool)); if (!force_resend && (!(req->r_flags & CEPH_OSD_FLAG_WRITE) || !force_resend_writes)) break; fallthrough; case CALC_TARGET_NEED_RESEND: cancel_map_check(req); unlink_request(osd, req); insert_request(need_resend, req); break; case CALC_TARGET_POOL_DNE: check_pool_dne(req); break; } } } static int handle_one_map(struct ceph_osd_client *osdc, void *p, void *end, bool incremental, struct rb_root *need_resend, struct list_head *need_resend_linger) { struct ceph_osdmap *newmap; struct rb_node *n; bool skipped_map = false; bool was_full; was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL); set_pool_was_full(osdc); if (incremental) newmap = osdmap_apply_incremental(&p, end, ceph_msgr2(osdc->client), osdc->osdmap); else newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client)); if (IS_ERR(newmap)) return PTR_ERR(newmap); if (newmap != osdc->osdmap) { /* * Preserve ->was_full before destroying the old map. * For pools that weren't in the old map, ->was_full * should be false. */ for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pi = rb_entry(n, struct ceph_pg_pool_info, node); struct ceph_pg_pool_info *old_pi; old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id); if (old_pi) pi->was_full = old_pi->was_full; else WARN_ON(pi->was_full); } if (osdc->osdmap->epoch && osdc->osdmap->epoch + 1 < newmap->epoch) { WARN_ON(incremental); skipped_map = true; } ceph_osdmap_destroy(osdc->osdmap); osdc->osdmap = newmap; } was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL); scan_requests(&osdc->homeless_osd, skipped_map, was_full, true, need_resend, need_resend_linger); for (n = rb_first(&osdc->osds); n; ) { struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); n = rb_next(n); /* close_osd() */ scan_requests(osd, skipped_map, was_full, true, need_resend, need_resend_linger); if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || memcmp(&osd->o_con.peer_addr, ceph_osd_addr(osdc->osdmap, osd->o_osd), sizeof(struct ceph_entity_addr))) close_osd(osd); } return 0; } static void kick_requests(struct ceph_osd_client *osdc, struct rb_root *need_resend, struct list_head *need_resend_linger) { struct ceph_osd_linger_request *lreq, *nlreq; enum calc_target_result ct_res; struct rb_node *n; /* make sure need_resend targets reflect latest map */ for (n = rb_first(need_resend); n; ) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); n = rb_next(n); if (req->r_t.epoch < osdc->osdmap->epoch) { ct_res = calc_target(osdc, &req->r_t, false); if (ct_res == CALC_TARGET_POOL_DNE) { erase_request(need_resend, req); check_pool_dne(req); } } } for (n = rb_first(need_resend); n; ) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); struct ceph_osd *osd; n = rb_next(n); erase_request(need_resend, req); /* before link_request() */ osd = lookup_create_osd(osdc, req->r_t.osd, true); link_request(osd, req); if (!req->r_linger) { if (!osd_homeless(osd) && !req->r_t.paused) send_request(req); } else { cancel_linger_request(req); } } list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) { if (!osd_homeless(lreq->osd)) send_linger(lreq); list_del_init(&lreq->scan_item); } } /* * Process updated osd map. * * The message contains any number of incremental and full maps, normally * indicating some sort of topology change in the cluster. Kick requests * off to different OSDs as needed. */ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) { void *p = msg->front.iov_base; void *const end = p + msg->front.iov_len; u32 nr_maps, maplen; u32 epoch; struct ceph_fsid fsid; struct rb_root need_resend = RB_ROOT; LIST_HEAD(need_resend_linger); bool handled_incremental = false; bool was_pauserd, was_pausewr; bool pauserd, pausewr; int err; dout("%s have %u\n", __func__, osdc->osdmap->epoch); down_write(&osdc->lock); /* verify fsid */ ceph_decode_need(&p, end, sizeof(fsid), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); if (ceph_check_fsid(osdc->client, &fsid) < 0) goto bad; was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc); /* incremental maps */ ceph_decode_32_safe(&p, end, nr_maps, bad); dout(" %d inc maps\n", nr_maps); while (nr_maps > 0) { ceph_decode_need(&p, end, 2*sizeof(u32), bad); epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); ceph_decode_need(&p, end, maplen, bad); if (osdc->osdmap->epoch && osdc->osdmap->epoch + 1 == epoch) { dout("applying incremental map %u len %d\n", epoch, maplen); err = handle_one_map(osdc, p, p + maplen, true, &need_resend, &need_resend_linger); if (err) goto bad; handled_incremental = true; } else { dout("ignoring incremental map %u len %d\n", epoch, maplen); } p += maplen; nr_maps--; } if (handled_incremental) goto done; /* full maps */ ceph_decode_32_safe(&p, end, nr_maps, bad); dout(" %d full maps\n", nr_maps); while (nr_maps) { ceph_decode_need(&p, end, 2*sizeof(u32), bad); epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); ceph_decode_need(&p, end, maplen, bad); if (nr_maps > 1) { dout("skipping non-latest full map %u len %d\n", epoch, maplen); } else if (osdc->osdmap->epoch >= epoch) { dout("skipping full map %u len %d, " "older than our %u\n", epoch, maplen, osdc->osdmap->epoch); } else { dout("taking full map %u len %d\n", epoch, maplen); err = handle_one_map(osdc, p, p + maplen, false, &need_resend, &need_resend_linger); if (err) goto bad; } p += maplen; nr_maps--; } done: /* * subscribe to subsequent osdmap updates if full to ensure * we find out when we are no longer full and stop returning * ENOSPC. */ pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc); if (was_pauserd || was_pausewr || pauserd || pausewr || osdc->osdmap->epoch < osdc->epoch_barrier) maybe_request_map(osdc); kick_requests(osdc, &need_resend, &need_resend_linger); ceph_osdc_abort_on_full(osdc); ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, osdc->osdmap->epoch); up_write(&osdc->lock); wake_up_all(&osdc->client->auth_wq); return; bad: pr_err("osdc handle_map corrupt msg\n"); ceph_msg_dump(msg); up_write(&osdc->lock); } /* * Resubmit requests pending on the given osd. */ static void kick_osd_requests(struct ceph_osd *osd) { struct rb_node *n; clear_backoffs(osd); for (n = rb_first(&osd->o_requests); n; ) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); n = rb_next(n); /* cancel_linger_request() */ if (!req->r_linger) { if (!req->r_t.paused) send_request(req); } else { cancel_linger_request(req); } } for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) { struct ceph_osd_linger_request *lreq = rb_entry(n, struct ceph_osd_linger_request, node); send_linger(lreq); } } /* * If the osd connection drops, we need to resubmit all requests. */ static void osd_fault(struct ceph_connection *con) { struct ceph_osd *osd = con->private; struct ceph_osd_client *osdc = osd->o_osdc; dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); down_write(&osdc->lock); if (!osd_registered(osd)) { dout("%s osd%d unknown\n", __func__, osd->o_osd); goto out_unlock; } if (!reopen_osd(osd)) kick_osd_requests(osd); maybe_request_map(osdc); out_unlock: up_write(&osdc->lock); } struct MOSDBackoff { struct ceph_spg spgid; u32 map_epoch; u8 op; u64 id; struct ceph_hobject_id *begin; struct ceph_hobject_id *end; }; static int decode_MOSDBackoff(const struct ceph_msg *msg, struct MOSDBackoff *m) { void *p = msg->front.iov_base; void *const end = p + msg->front.iov_len; u8 struct_v; u32 struct_len; int ret; ret = ceph_start_decoding(&p, end, 1, "spg_t", &struct_v, &struct_len); if (ret) return ret; ret = ceph_decode_pgid(&p, end, &m->spgid.pgid); if (ret) return ret; ceph_decode_8_safe(&p, end, m->spgid.shard, e_inval); ceph_decode_32_safe(&p, end, m->map_epoch, e_inval); ceph_decode_8_safe(&p, end, m->op, e_inval); ceph_decode_64_safe(&p, end, m->id, e_inval); m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO); if (!m->begin) return -ENOMEM; ret = decode_hoid(&p, end, m->begin); if (ret) { free_hoid(m->begin); return ret; } m->end = kzalloc(sizeof(*m->end), GFP_NOIO); if (!m->end) { free_hoid(m->begin); return -ENOMEM; } ret = decode_hoid(&p, end, m->end); if (ret) { free_hoid(m->begin); free_hoid(m->end); return ret; } return 0; e_inval: return -EINVAL; } static struct ceph_msg *create_backoff_message( const struct ceph_osd_backoff *backoff, u32 map_epoch) { struct ceph_msg *msg; void *p, *end; int msg_size; msg_size = CEPH_ENCODING_START_BLK_LEN + CEPH_PGID_ENCODING_LEN + 1; /* spgid */ msg_size += 4 + 1 + 8; /* map_epoch, op, id */ msg_size += CEPH_ENCODING_START_BLK_LEN + hoid_encoding_size(backoff->begin); msg_size += CEPH_ENCODING_START_BLK_LEN + hoid_encoding_size(backoff->end); msg = ceph_msg_new(CEPH_MSG_OSD_BACKOFF, msg_size, GFP_NOIO, true); if (!msg) return NULL; p = msg->front.iov_base; end = p + msg->front_alloc_len; encode_spgid(&p, &backoff->spgid); ceph_encode_32(&p, map_epoch); ceph_encode_8(&p, CEPH_OSD_BACKOFF_OP_ACK_BLOCK); ceph_encode_64(&p, backoff->id); encode_hoid(&p, end, backoff->begin); encode_hoid(&p, end, backoff->end); BUG_ON(p != end); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.version = cpu_to_le16(1); /* MOSDBackoff v1 */ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); return msg; } static void handle_backoff_block(struct ceph_osd *osd, struct MOSDBackoff *m) { struct ceph_spg_mapping *spg; struct ceph_osd_backoff *backoff; struct ceph_msg *msg; dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd, m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id); spg = lookup_spg_mapping(&osd->o_backoff_mappings, &m->spgid); if (!spg) { spg = alloc_spg_mapping(); if (!spg) { pr_err("%s failed to allocate spg\n", __func__); return; } spg->spgid = m->spgid; /* struct */ insert_spg_mapping(&osd->o_backoff_mappings, spg); } backoff = alloc_backoff(); if (!backoff) { pr_err("%s failed to allocate backoff\n", __func__); return; } backoff->spgid = m->spgid; /* struct */ backoff->id = m->id; backoff->begin = m->begin; m->begin = NULL; /* backoff now owns this */ backoff->end = m->end; m->end = NULL; /* ditto */ insert_backoff(&spg->backoffs, backoff); insert_backoff_by_id(&osd->o_backoffs_by_id, backoff); /* * Ack with original backoff's epoch so that the OSD can * discard this if there was a PG split. */ msg = create_backoff_message(backoff, m->map_epoch); if (!msg) { pr_err("%s failed to allocate msg\n", __func__); return; } ceph_con_send(&osd->o_con, msg); } static bool target_contained_by(const struct ceph_osd_request_target *t, const struct ceph_hobject_id *begin, const struct ceph_hobject_id *end) { struct ceph_hobject_id hoid; int cmp; hoid_fill_from_target(&hoid, t); cmp = hoid_compare(&hoid, begin); return !cmp || (cmp > 0 && hoid_compare(&hoid, end) < 0); } static void handle_backoff_unblock(struct ceph_osd *osd, const struct MOSDBackoff *m) { struct ceph_spg_mapping *spg; struct ceph_osd_backoff *backoff; struct rb_node *n; dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd, m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id); backoff = lookup_backoff_by_id(&osd->o_backoffs_by_id, m->id); if (!backoff) { pr_err("%s osd%d spgid %llu.%xs%d id %llu backoff dne\n", __func__, osd->o_osd, m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id); return; } if (hoid_compare(backoff->begin, m->begin) && hoid_compare(backoff->end, m->end)) { pr_err("%s osd%d spgid %llu.%xs%d id %llu bad range?\n", __func__, osd->o_osd, m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id); /* unblock it anyway... */ } spg = lookup_spg_mapping(&osd->o_backoff_mappings, &backoff->spgid); BUG_ON(!spg); erase_backoff(&spg->backoffs, backoff); erase_backoff_by_id(&osd->o_backoffs_by_id, backoff); free_backoff(backoff); if (RB_EMPTY_ROOT(&spg->backoffs)) { erase_spg_mapping(&osd->o_backoff_mappings, spg); free_spg_mapping(spg); } for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); if (!ceph_spg_compare(&req->r_t.spgid, &m->spgid)) { /* * Match against @m, not @backoff -- the PG may * have split on the OSD. */ if (target_contained_by(&req->r_t, m->begin, m->end)) { /* * If no other installed backoff applies, * resend. */ send_request(req); } } } } static void handle_backoff(struct ceph_osd *osd, struct ceph_msg *msg) { struct ceph_osd_client *osdc = osd->o_osdc; struct MOSDBackoff m; int ret; down_read(&osdc->lock); if (!osd_registered(osd)) { dout("%s osd%d unknown\n", __func__, osd->o_osd); up_read(&osdc->lock); return; } WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num)); mutex_lock(&osd->lock); ret = decode_MOSDBackoff(msg, &m); if (ret) { pr_err("failed to decode MOSDBackoff: %d\n", ret); ceph_msg_dump(msg); goto out_unlock; } switch (m.op) { case CEPH_OSD_BACKOFF_OP_BLOCK: handle_backoff_block(osd, &m); break; case CEPH_OSD_BACKOFF_OP_UNBLOCK: handle_backoff_unblock(osd, &m); break; default: pr_err("%s osd%d unknown op %d\n", __func__, osd->o_osd, m.op); } free_hoid(m.begin); free_hoid(m.end); out_unlock: mutex_unlock(&osd->lock); up_read(&osdc->lock); } /* * Process osd watch notifications */ static void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) { void *p = msg->front.iov_base; void *const end = p + msg->front.iov_len; struct ceph_osd_linger_request *lreq; struct linger_work *lwork; u8 proto_ver, opcode; u64 cookie, notify_id; u64 notifier_id = 0; s32 return_code = 0; void *payload = NULL; u32 payload_len = 0; ceph_decode_8_safe(&p, end, proto_ver, bad); ceph_decode_8_safe(&p, end, opcode, bad); ceph_decode_64_safe(&p, end, cookie, bad); p += 8; /* skip ver */ ceph_decode_64_safe(&p, end, notify_id, bad); if (proto_ver >= 1) { ceph_decode_32_safe(&p, end, payload_len, bad); ceph_decode_need(&p, end, payload_len, bad); payload = p; p += payload_len; } if (le16_to_cpu(msg->hdr.version) >= 2) ceph_decode_32_safe(&p, end, return_code, bad); if (le16_to_cpu(msg->hdr.version) >= 3) ceph_decode_64_safe(&p, end, notifier_id, bad); down_read(&osdc->lock); lreq = lookup_linger_osdc(&osdc->linger_requests, cookie); if (!lreq) { dout("%s opcode %d cookie %llu dne\n", __func__, opcode, cookie); goto out_unlock_osdc; } mutex_lock(&lreq->lock); dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__, opcode, cookie, lreq, lreq->is_watch); if (opcode == CEPH_WATCH_EVENT_DISCONNECT) { if (!lreq->last_error) { lreq->last_error = -ENOTCONN; queue_watch_error(lreq); } } else if (!lreq->is_watch) { /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */ if (lreq->notify_id && lreq->notify_id != notify_id) { dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq, lreq->notify_id, notify_id); } else if (!completion_done(&lreq->notify_finish_wait)) { struct ceph_msg_data *data = msg->num_data_items ? &msg->data[0] : NULL; if (data) { if (lreq->preply_pages) { WARN_ON(data->type != CEPH_MSG_DATA_PAGES); *lreq->preply_pages = data->pages; *lreq->preply_len = data->length; data->own_pages = false; } } lreq->notify_finish_error = return_code; complete_all(&lreq->notify_finish_wait); } } else { /* CEPH_WATCH_EVENT_NOTIFY */ lwork = lwork_alloc(lreq, do_watch_notify); if (!lwork) { pr_err("failed to allocate notify-lwork\n"); goto out_unlock_lreq; } lwork->notify.notify_id = notify_id; lwork->notify.notifier_id = notifier_id; lwork->notify.payload = payload; lwork->notify.payload_len = payload_len; lwork->notify.msg = ceph_msg_get(msg); lwork_queue(lwork); } out_unlock_lreq: mutex_unlock(&lreq->lock); out_unlock_osdc: up_read(&osdc->lock); return; bad: pr_err("osdc handle_watch_notify corrupt msg\n"); } /* * Register request, send initial attempt. */ void ceph_osdc_start_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { down_read(&osdc->lock); submit_request(req, false); up_read(&osdc->lock); } EXPORT_SYMBOL(ceph_osdc_start_request); /* * Unregister request. If @req was registered, it isn't completed: * r_result isn't set and __complete_request() isn't invoked. * * If @req wasn't registered, this call may have raced with * handle_reply(), in which case r_result would already be set and * __complete_request() would be getting invoked, possibly even * concurrently with this call. */ void ceph_osdc_cancel_request(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; down_write(&osdc->lock); if (req->r_osd) cancel_request(req); up_write(&osdc->lock); } EXPORT_SYMBOL(ceph_osdc_cancel_request); /* * @timeout: in jiffies, 0 means "wait forever" */ static int wait_request_timeout(struct ceph_osd_request *req, unsigned long timeout) { long left; dout("%s req %p tid %llu\n", __func__, req, req->r_tid); left = wait_for_completion_killable_timeout(&req->r_completion, ceph_timeout_jiffies(timeout)); if (left <= 0) { left = left ?: -ETIMEDOUT; ceph_osdc_cancel_request(req); } else { left = req->r_result; /* completed */ } return left; } /* * wait for a request to complete */ int ceph_osdc_wait_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { return wait_request_timeout(req, 0); } EXPORT_SYMBOL(ceph_osdc_wait_request); /* * sync - wait for all in-flight requests to flush. avoid starvation. */ void ceph_osdc_sync(struct ceph_osd_client *osdc) { struct rb_node *n, *p; u64 last_tid = atomic64_read(&osdc->last_tid); again: down_read(&osdc->lock); for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); mutex_lock(&osd->lock); for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { struct ceph_osd_request *req = rb_entry(p, struct ceph_osd_request, r_node); if (req->r_tid > last_tid) break; if (!(req->r_flags & CEPH_OSD_FLAG_WRITE)) continue; ceph_osdc_get_request(req); mutex_unlock(&osd->lock); up_read(&osdc->lock); dout("%s waiting on req %p tid %llu last_tid %llu\n", __func__, req, req->r_tid, last_tid); wait_for_completion(&req->r_completion); ceph_osdc_put_request(req); goto again; } mutex_unlock(&osd->lock); } up_read(&osdc->lock); dout("%s done last_tid %llu\n", __func__, last_tid); } EXPORT_SYMBOL(ceph_osdc_sync); /* * Returns a handle, caller owns a ref. */ struct ceph_osd_linger_request * ceph_osdc_watch(struct ceph_osd_client *osdc, struct ceph_object_id *oid, struct ceph_object_locator *oloc, rados_watchcb2_t wcb, rados_watcherrcb_t errcb, void *data) { struct ceph_osd_linger_request *lreq; int ret; lreq = linger_alloc(osdc); if (!lreq) return ERR_PTR(-ENOMEM); lreq->is_watch = true; lreq->wcb = wcb; lreq->errcb = errcb; lreq->data = data; lreq->watch_valid_thru = jiffies; ceph_oid_copy(&lreq->t.base_oid, oid); ceph_oloc_copy(&lreq->t.base_oloc, oloc); lreq->t.flags = CEPH_OSD_FLAG_WRITE; ktime_get_real_ts64(&lreq->mtime); linger_submit(lreq); ret = linger_reg_commit_wait(lreq); if (ret) { linger_cancel(lreq); goto err_put_lreq; } return lreq; err_put_lreq: linger_put(lreq); return ERR_PTR(ret); } EXPORT_SYMBOL(ceph_osdc_watch); /* * Releases a ref. * * Times out after mount_timeout to preserve rbd unmap behaviour * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap * with mount_timeout"). */ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, struct ceph_osd_linger_request *lreq) { struct ceph_options *opts = osdc->client->options; struct ceph_osd_request *req; int ret; req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); if (!req) return -ENOMEM; ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); req->r_flags = CEPH_OSD_FLAG_WRITE; ktime_get_real_ts64(&req->r_mtime); osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_UNWATCH, lreq->linger_id, 0); ret = ceph_osdc_alloc_messages(req, GFP_NOIO); if (ret) goto out_put_req; ceph_osdc_start_request(osdc, req); linger_cancel(lreq); linger_put(lreq); ret = wait_request_timeout(req, opts->mount_timeout); out_put_req: ceph_osdc_put_request(req); return ret; } EXPORT_SYMBOL(ceph_osdc_unwatch); static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which, u64 notify_id, u64 cookie, void *payload, u32 payload_len) { struct ceph_osd_req_op *op; struct ceph_pagelist *pl; int ret; op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); pl = ceph_pagelist_alloc(GFP_NOIO); if (!pl) return -ENOMEM; ret = ceph_pagelist_encode_64(pl, notify_id); ret |= ceph_pagelist_encode_64(pl, cookie); if (payload) { ret |= ceph_pagelist_encode_32(pl, payload_len); ret |= ceph_pagelist_append(pl, payload, payload_len); } else { ret |= ceph_pagelist_encode_32(pl, 0); } if (ret) { ceph_pagelist_release(pl); return -ENOMEM; } ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl); op->indata_len = pl->length; return 0; } int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, struct ceph_object_id *oid, struct ceph_object_locator *oloc, u64 notify_id, u64 cookie, void *payload, u32 payload_len) { struct ceph_osd_request *req; int ret; req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); if (!req) return -ENOMEM; ceph_oid_copy(&req->r_base_oid, oid); ceph_oloc_copy(&req->r_base_oloc, oloc); req->r_flags = CEPH_OSD_FLAG_READ; ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload, payload_len); if (ret) goto out_put_req; ret = ceph_osdc_alloc_messages(req, GFP_NOIO); if (ret) goto out_put_req; ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); out_put_req: ceph_osdc_put_request(req); return ret; } EXPORT_SYMBOL(ceph_osdc_notify_ack); /* * @timeout: in seconds * * @preply_{pages,len} are initialized both on success and error. * The caller is responsible for: * * ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)) */ int ceph_osdc_notify(struct ceph_osd_client *osdc, struct ceph_object_id *oid, struct ceph_object_locator *oloc, void *payload, u32 payload_len, u32 timeout, struct page ***preply_pages, size_t *preply_len) { struct ceph_osd_linger_request *lreq; int ret; WARN_ON(!timeout); if (preply_pages) { *preply_pages = NULL; *preply_len = 0; } lreq = linger_alloc(osdc); if (!lreq) return -ENOMEM; lreq->request_pl = ceph_pagelist_alloc(GFP_NOIO); if (!lreq->request_pl) { ret = -ENOMEM; goto out_put_lreq; } ret = ceph_pagelist_encode_32(lreq->request_pl, 1); /* prot_ver */ ret |= ceph_pagelist_encode_32(lreq->request_pl, timeout); ret |= ceph_pagelist_encode_32(lreq->request_pl, payload_len); ret |= ceph_pagelist_append(lreq->request_pl, payload, payload_len); if (ret) { ret = -ENOMEM; goto out_put_lreq; } /* for notify_id */ lreq->notify_id_pages = ceph_alloc_page_vector(1, GFP_NOIO); if (IS_ERR(lreq->notify_id_pages)) { ret = PTR_ERR(lreq->notify_id_pages); lreq->notify_id_pages = NULL; goto out_put_lreq; } lreq->preply_pages = preply_pages; lreq->preply_len = preply_len; ceph_oid_copy(&lreq->t.base_oid, oid); ceph_oloc_copy(&lreq->t.base_oloc, oloc); lreq->t.flags = CEPH_OSD_FLAG_READ; linger_submit(lreq); ret = linger_reg_commit_wait(lreq); if (!ret) ret = linger_notify_finish_wait(lreq, msecs_to_jiffies(2 * timeout * MSEC_PER_SEC)); else dout("lreq %p failed to initiate notify %d\n", lreq, ret); linger_cancel(lreq); out_put_lreq: linger_put(lreq); return ret; } EXPORT_SYMBOL(ceph_osdc_notify); static int decode_watcher(void **p, void *end, struct ceph_watch_item *item) { u8 struct_v; u32 struct_len; int ret; ret = ceph_start_decoding(p, end, 2, "watch_item_t", &struct_v, &struct_len); if (ret) goto bad; ret = -EINVAL; ceph_decode_copy_safe(p, end, &item->name, sizeof(item->name), bad); ceph_decode_64_safe(p, end, item->cookie, bad); ceph_decode_skip_32(p, end, bad); /* skip timeout seconds */ if (struct_v >= 2) { ret = ceph_decode_entity_addr(p, end, &item->addr); if (ret) goto bad; } else { ret = 0; } dout("%s %s%llu cookie %llu addr %s\n", __func__, ENTITY_NAME(item->name), item->cookie, ceph_pr_addr(&item->addr)); bad: return ret; } static int decode_watchers(void **p, void *end, struct ceph_watch_item **watchers, u32 *num_watchers) { u8 struct_v; u32 struct_len; int i; int ret; ret = ceph_start_decoding(p, end, 1, "obj_list_watch_response_t", &struct_v, &struct_len); if (ret) return ret; *num_watchers = ceph_decode_32(p); *watchers = kcalloc(*num_watchers, sizeof(**watchers), GFP_NOIO); if (!*watchers) return -ENOMEM; for (i = 0; i < *num_watchers; i++) { ret = decode_watcher(p, end, *watchers + i); if (ret) { kfree(*watchers); return ret; } } return 0; } /* * On success, the caller is responsible for: * * kfree(watchers); */ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, struct ceph_object_id *oid, struct ceph_object_locator *oloc, struct ceph_watch_item **watchers, u32 *num_watchers) { struct ceph_osd_request *req; struct page **pages; int ret; req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); if (!req) return -ENOMEM; ceph_oid_copy(&req->r_base_oid, oid); ceph_oloc_copy(&req->r_base_oloc, oloc); req->r_flags = CEPH_OSD_FLAG_READ; pages = ceph_alloc_page_vector(1, GFP_NOIO); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out_put_req; } osd_req_op_init(req, 0, CEPH_OSD_OP_LIST_WATCHERS, 0); ceph_osd_data_pages_init(osd_req_op_data(req, 0, list_watchers, response_data), pages, PAGE_SIZE, 0, false, true); ret = ceph_osdc_alloc_messages(req, GFP_NOIO); if (ret) goto out_put_req; ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) { void *p = page_address(pages[0]); void *const end = p + req->r_ops[0].outdata_len; ret = decode_watchers(&p, end, watchers, num_watchers); } out_put_req: ceph_osdc_put_request(req); return ret; } EXPORT_SYMBOL(ceph_osdc_list_watchers); /* * Call all pending notify callbacks - for use after a watch is * unregistered, to make sure no more callbacks for it will be invoked */ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc) { dout("%s osdc %p\n", __func__, osdc); flush_workqueue(osdc->notify_wq); } EXPORT_SYMBOL(ceph_osdc_flush_notifies); void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc) { down_read(&osdc->lock); maybe_request_map(osdc); up_read(&osdc->lock); } EXPORT_SYMBOL(ceph_osdc_maybe_request_map); /* * Execute an OSD class method on an object. * * @flags: CEPH_OSD_FLAG_* * @resp_len: in/out param for reply length */ int ceph_osdc_call(struct ceph_osd_client *osdc, struct ceph_object_id *oid, struct ceph_object_locator *oloc, const char *class, const char *method, unsigned int flags, struct page *req_page, size_t req_len, struct page **resp_pages, size_t *resp_len) { struct ceph_osd_request *req; int ret; if (req_len > PAGE_SIZE) return -E2BIG; req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); if (!req) return -ENOMEM; ceph_oid_copy(&req->r_base_oid, oid); ceph_oloc_copy(&req->r_base_oloc, oloc); req->r_flags = flags; ret = osd_req_op_cls_init(req, 0, class, method); if (ret) goto out_put_req; if (req_page) osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len, 0, false, false); if (resp_pages) osd_req_op_cls_response_data_pages(req, 0, resp_pages, *resp_len, 0, false, false); ret = ceph_osdc_alloc_messages(req, GFP_NOIO); if (ret) goto out_put_req; ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) { ret = req->r_ops[0].rval; if (resp_pages) *resp_len = req->r_ops[0].outdata_len; } out_put_req: ceph_osdc_put_request(req); return ret; } EXPORT_SYMBOL(ceph_osdc_call); /* * reset all osd connections */ void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc) { struct rb_node *n; down_write(&osdc->lock); for (n = rb_first(&osdc->osds); n; ) { struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); n = rb_next(n); if (!reopen_osd(osd)) kick_osd_requests(osd); } up_write(&osdc->lock); } /* * init, shutdown */ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) { int err; dout("init\n"); osdc->client = client; init_rwsem(&osdc->lock); osdc->osds = RB_ROOT; INIT_LIST_HEAD(&osdc->osd_lru); spin_lock_init(&osdc->osd_lru_lock); osd_init(&osdc->homeless_osd); osdc->homeless_osd.o_osdc = osdc; osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD; osdc->last_linger_id = CEPH_LINGER_ID_START; osdc->linger_requests = RB_ROOT; osdc->map_checks = RB_ROOT; osdc->linger_map_checks = RB_ROOT; INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); err = -ENOMEM; osdc->osdmap = ceph_osdmap_alloc(); if (!osdc->osdmap) goto out; osdc->req_mempool = mempool_create_slab_pool(10, ceph_osd_request_cache); if (!osdc->req_mempool) goto out_map; err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10, "osd_op"); if (err < 0) goto out_mempool; err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10, "osd_op_reply"); if (err < 0) goto out_msgpool; err = -ENOMEM; osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); if (!osdc->notify_wq) goto out_msgpool_reply; osdc->completion_wq = create_singlethread_workqueue("ceph-completion"); if (!osdc->completion_wq) goto out_notify_wq; schedule_delayed_work(&osdc->timeout_work, osdc->client->options->osd_keepalive_timeout); schedule_delayed_work(&osdc->osds_timeout_work, round_jiffies_relative(osdc->client->options->osd_idle_ttl)); return 0; out_notify_wq: destroy_workqueue(osdc->notify_wq); out_msgpool_reply: ceph_msgpool_destroy(&osdc->msgpool_op_reply); out_msgpool: ceph_msgpool_destroy(&osdc->msgpool_op); out_mempool: mempool_destroy(osdc->req_mempool); out_map: ceph_osdmap_destroy(osdc->osdmap); out: return err; } void ceph_osdc_stop(struct ceph_osd_client *osdc) { destroy_workqueue(osdc->completion_wq); destroy_workqueue(osdc->notify_wq); cancel_delayed_work_sync(&osdc->timeout_work); cancel_delayed_work_sync(&osdc->osds_timeout_work); down_write(&osdc->lock); while (!RB_EMPTY_ROOT(&osdc->osds)) { struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), struct ceph_osd, o_node); close_osd(osd); } up_write(&osdc->lock); WARN_ON(refcount_read(&osdc->homeless_osd.o_ref) != 1); osd_cleanup(&osdc->homeless_osd); WARN_ON(!list_empty(&osdc->osd_lru)); WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests)); WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks)); WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks)); WARN_ON(atomic_read(&osdc->num_requests)); WARN_ON(atomic_read(&osdc->num_homeless)); ceph_osdmap_destroy(osdc->osdmap); mempool_destroy(osdc->req_mempool); ceph_msgpool_destroy(&osdc->msgpool_op); ceph_msgpool_destroy(&osdc->msgpool_op_reply); } int osd_req_op_copy_from_init(struct ceph_osd_request *req, u64 src_snapid, u64 src_version, struct ceph_object_id *src_oid, struct ceph_object_locator *src_oloc, u32 src_fadvise_flags, u32 dst_fadvise_flags, u32 truncate_seq, u64 truncate_size, u8 copy_from_flags) { struct ceph_osd_req_op *op; struct page **pages; void *p, *end; pages = ceph_alloc_page_vector(1, GFP_KERNEL); if (IS_ERR(pages)) return PTR_ERR(pages); op = osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2, dst_fadvise_flags); op->copy_from.snapid = src_snapid; op->copy_from.src_version = src_version; op->copy_from.flags = copy_from_flags; op->copy_from.src_fadvise_flags = src_fadvise_flags; p = page_address(pages[0]); end = p + PAGE_SIZE; ceph_encode_string(&p, end, src_oid->name, src_oid->name_len); encode_oloc(&p, end, src_oloc); ceph_encode_32(&p, truncate_seq); ceph_encode_64(&p, truncate_size); op->indata_len = PAGE_SIZE - (end - p); ceph_osd_data_pages_init(&op->copy_from.osd_data, pages, op->indata_len, 0, false, true); return 0; } EXPORT_SYMBOL(osd_req_op_copy_from_init); int __init ceph_osdc_setup(void) { size_t size = sizeof(struct ceph_osd_request) + CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op); BUG_ON(ceph_osd_request_cache); ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size, 0, 0, NULL); return ceph_osd_request_cache ? 0 : -ENOMEM; } void ceph_osdc_cleanup(void) { BUG_ON(!ceph_osd_request_cache); kmem_cache_destroy(ceph_osd_request_cache); ceph_osd_request_cache = NULL; } /* * handle incoming message */ static void osd_dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_osd *osd = con->private; struct ceph_osd_client *osdc = osd->o_osdc; int type = le16_to_cpu(msg->hdr.type); switch (type) { case CEPH_MSG_OSD_MAP: ceph_osdc_handle_map(osdc, msg); break; case CEPH_MSG_OSD_OPREPLY: handle_reply(osd, msg); break; case CEPH_MSG_OSD_BACKOFF: handle_backoff(osd, msg); break; case CEPH_MSG_WATCH_NOTIFY: handle_watch_notify(osdc, msg); break; default: pr_err("received unknown message type %d %s\n", type, ceph_msg_type_name(type)); } ceph_msg_put(msg); } /* How much sparse data was requested? */ static u64 sparse_data_requested(struct ceph_osd_request *req) { u64 len = 0; if (req->r_flags & CEPH_OSD_FLAG_READ) { int i; for (i = 0; i < req->r_num_ops; ++i) { struct ceph_osd_req_op *op = &req->r_ops[i]; if (op->op == CEPH_OSD_OP_SPARSE_READ) len += op->extent.length; } } return len; } /* * Lookup and return message for incoming reply. Don't try to do * anything about a larger than preallocated data portion of the * message at the moment - for now, just skip the message. */ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { struct ceph_osd *osd = con->private; struct ceph_osd_client *osdc = osd->o_osdc; struct ceph_msg *m = NULL; struct ceph_osd_request *req; int front_len = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); u64 tid = le64_to_cpu(hdr->tid); u64 srlen; down_read(&osdc->lock); if (!osd_registered(osd)) { dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd); *skip = 1; goto out_unlock_osdc; } WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num)); mutex_lock(&osd->lock); req = lookup_request(&osd->o_requests, tid); if (!req) { dout("%s osd%d tid %llu unknown, skipping\n", __func__, osd->o_osd, tid); *skip = 1; goto out_unlock_session; } ceph_msg_revoke_incoming(req->r_reply); if (front_len > req->r_reply->front_alloc_len) { pr_warn("%s osd%d tid %llu front %d > preallocated %d\n", __func__, osd->o_osd, req->r_tid, front_len, req->r_reply->front_alloc_len); m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, false); if (!m) goto out_unlock_session; ceph_msg_put(req->r_reply); req->r_reply = m; } srlen = sparse_data_requested(req); if (!srlen && data_len > req->r_reply->data_length) { pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n", __func__, osd->o_osd, req->r_tid, data_len, req->r_reply->data_length); m = NULL; *skip = 1; goto out_unlock_session; } m = ceph_msg_get(req->r_reply); m->sparse_read_total = srlen; dout("get_reply tid %lld %p\n", tid, m); out_unlock_session: mutex_unlock(&osd->lock); out_unlock_osdc: up_read(&osdc->lock); return m; } static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) { struct ceph_msg *m; int type = le16_to_cpu(hdr->type); u32 front_len = le32_to_cpu(hdr->front_len); u32 data_len = le32_to_cpu(hdr->data_len); m = ceph_msg_new2(type, front_len, 1, GFP_NOIO, false); if (!m) return NULL; if (data_len) { struct page **pages; pages = ceph_alloc_page_vector(calc_pages_for(0, data_len), GFP_NOIO); if (IS_ERR(pages)) { ceph_msg_put(m); return NULL; } ceph_msg_data_add_pages(m, pages, data_len, 0, true); } return m; } static struct ceph_msg *osd_alloc_msg(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { struct ceph_osd *osd = con->private; int type = le16_to_cpu(hdr->type); *skip = 0; switch (type) { case CEPH_MSG_OSD_MAP: case CEPH_MSG_OSD_BACKOFF: case CEPH_MSG_WATCH_NOTIFY: return alloc_msg_with_page_vector(hdr); case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); default: pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__, osd->o_osd, type); *skip = 1; return NULL; } } /* * Wrappers to refcount containing ceph_osd struct */ static struct ceph_connection *osd_get_con(struct ceph_connection *con) { struct ceph_osd *osd = con->private; if (get_osd(osd)) return con; return NULL; } static void osd_put_con(struct ceph_connection *con) { struct ceph_osd *osd = con->private; put_osd(osd); } /* * authentication */ /* * Note: returned pointer is the address of a structure that's * managed separately. Caller must *not* attempt to free it. */ static struct ceph_auth_handshake * osd_get_authorizer(struct ceph_connection *con, int *proto, int force_new) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; int ret; ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, force_new, proto, NULL, NULL); if (ret) return ERR_PTR(ret); return auth; } static int osd_add_authorizer_challenge(struct ceph_connection *con, void *challenge_buf, int challenge_buf_len) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; return ceph_auth_add_authorizer_challenge(ac, o->o_auth.authorizer, challenge_buf, challenge_buf_len); } static int osd_verify_authorizer_reply(struct ceph_connection *con) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, NULL, NULL, NULL, NULL); } static int osd_invalidate_authorizer(struct ceph_connection *con) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); return ceph_monc_validate_auth(&osdc->client->monc); } static int osd_get_auth_request(struct ceph_connection *con, void *buf, int *buf_len, void **authorizer, int *authorizer_len) { struct ceph_osd *o = con->private; struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; int ret; ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, buf, buf_len); if (ret) return ret; *authorizer = auth->authorizer_buf; *authorizer_len = auth->authorizer_buf_len; return 0; } static int osd_handle_auth_reply_more(struct ceph_connection *con, void *reply, int reply_len, void *buf, int *buf_len, void **authorizer, int *authorizer_len) { struct ceph_osd *o = con->private; struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; int ret; ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, buf, buf_len); if (ret) return ret; *authorizer = auth->authorizer_buf; *authorizer_len = auth->authorizer_buf_len; return 0; } static int osd_handle_auth_done(struct ceph_connection *con, u64 global_id, void *reply, int reply_len, u8 *session_key, int *session_key_len, u8 *con_secret, int *con_secret_len) { struct ceph_osd *o = con->private; struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, session_key, session_key_len, con_secret, con_secret_len); } static int osd_handle_auth_bad_method(struct ceph_connection *con, int used_proto, int result, const int *allowed_protos, int proto_cnt, const int *allowed_modes, int mode_cnt) { struct ceph_osd *o = con->private; struct ceph_mon_client *monc = &o->o_osdc->client->monc; int ret; if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_OSD, used_proto, result, allowed_protos, proto_cnt, allowed_modes, mode_cnt)) { ret = ceph_monc_validate_auth(monc); if (ret) return ret; } return -EACCES; } static void osd_reencode_message(struct ceph_msg *msg) { int type = le16_to_cpu(msg->hdr.type); if (type == CEPH_MSG_OSD_OP) encode_request_finish(msg); } static int osd_sign_message(struct ceph_msg *msg) { struct ceph_osd *o = msg->con->private; struct ceph_auth_handshake *auth = &o->o_auth; return ceph_auth_sign_message(auth, msg); } static int osd_check_message_signature(struct ceph_msg *msg) { struct ceph_osd *o = msg->con->private; struct ceph_auth_handshake *auth = &o->o_auth; return ceph_auth_check_message_signature(auth, msg); } static void advance_cursor(struct ceph_msg_data_cursor *cursor, size_t len, bool zero) { while (len) { struct page *page; size_t poff, plen; page = ceph_msg_data_next(cursor, &poff, &plen); if (plen > len) plen = len; if (zero) zero_user_segment(page, poff, poff + plen); len -= plen; ceph_msg_data_advance(cursor, plen); } } static int prep_next_sparse_read(struct ceph_connection *con, struct ceph_msg_data_cursor *cursor) { struct ceph_osd *o = con->private; struct ceph_sparse_read *sr = &o->o_sparse_read; struct ceph_osd_request *req; struct ceph_osd_req_op *op; spin_lock(&o->o_requests_lock); req = lookup_request(&o->o_requests, le64_to_cpu(con->in_msg->hdr.tid)); if (!req) { spin_unlock(&o->o_requests_lock); return -EBADR; } if (o->o_sparse_op_idx < 0) { dout("%s: [%d] starting new sparse read req\n", __func__, o->o_osd); } else { u64 end; op = &req->r_ops[o->o_sparse_op_idx]; WARN_ON_ONCE(op->extent.sparse_ext); /* hand back buffer we took earlier */ op->extent.sparse_ext = sr->sr_extent; sr->sr_extent = NULL; op->extent.sparse_ext_cnt = sr->sr_count; sr->sr_ext_len = 0; dout("%s: [%d] completed extent array len %d cursor->resid %zd\n", __func__, o->o_osd, op->extent.sparse_ext_cnt, cursor->resid); /* Advance to end of data for this operation */ end = ceph_sparse_ext_map_end(op); if (end < sr->sr_req_len) advance_cursor(cursor, sr->sr_req_len - end, false); } ceph_init_sparse_read(sr); /* find next op in this request (if any) */ while (++o->o_sparse_op_idx < req->r_num_ops) { op = &req->r_ops[o->o_sparse_op_idx]; if (op->op == CEPH_OSD_OP_SPARSE_READ) goto found; } /* reset for next sparse read request */ spin_unlock(&o->o_requests_lock); o->o_sparse_op_idx = -1; return 0; found: sr->sr_req_off = op->extent.offset; sr->sr_req_len = op->extent.length; sr->sr_pos = sr->sr_req_off; dout("%s: [%d] new sparse read op at idx %d 0x%llx~0x%llx\n", __func__, o->o_osd, o->o_sparse_op_idx, sr->sr_req_off, sr->sr_req_len); /* hand off request's sparse extent map buffer */ sr->sr_ext_len = op->extent.sparse_ext_cnt; op->extent.sparse_ext_cnt = 0; sr->sr_extent = op->extent.sparse_ext; op->extent.sparse_ext = NULL; spin_unlock(&o->o_requests_lock); return 1; } #ifdef __BIG_ENDIAN static inline void convert_extent_map(struct ceph_sparse_read *sr) { int i; for (i = 0; i < sr->sr_count; i++) { struct ceph_sparse_extent *ext = &sr->sr_extent[i]; ext->off = le64_to_cpu((__force __le64)ext->off); ext->len = le64_to_cpu((__force __le64)ext->len); } } #else static inline void convert_extent_map(struct ceph_sparse_read *sr) { } #endif static int osd_sparse_read(struct ceph_connection *con, struct ceph_msg_data_cursor *cursor, char **pbuf) { struct ceph_osd *o = con->private; struct ceph_sparse_read *sr = &o->o_sparse_read; u32 count = sr->sr_count; u64 eoff, elen, len = 0; int i, ret; switch (sr->sr_state) { case CEPH_SPARSE_READ_HDR: next_op: ret = prep_next_sparse_read(con, cursor); if (ret <= 0) return ret; /* number of extents */ ret = sizeof(sr->sr_count); *pbuf = (char *)&sr->sr_count; sr->sr_state = CEPH_SPARSE_READ_EXTENTS; break; case CEPH_SPARSE_READ_EXTENTS: /* Convert sr_count to host-endian */ count = le32_to_cpu((__force __le32)sr->sr_count); sr->sr_count = count; dout("[%d] got %u extents\n", o->o_osd, count); if (count > 0) { if (!sr->sr_extent || count > sr->sr_ext_len) { /* no extent array provided, or too short */ kfree(sr->sr_extent); sr->sr_extent = kmalloc_array(count, sizeof(*sr->sr_extent), GFP_NOIO); if (!sr->sr_extent) { pr_err("%s: failed to allocate %u extents\n", __func__, count); return -ENOMEM; } sr->sr_ext_len = count; } ret = count * sizeof(*sr->sr_extent); *pbuf = (char *)sr->sr_extent; sr->sr_state = CEPH_SPARSE_READ_DATA_LEN; break; } /* No extents? Read data len */ fallthrough; case CEPH_SPARSE_READ_DATA_LEN: convert_extent_map(sr); ret = sizeof(sr->sr_datalen); *pbuf = (char *)&sr->sr_datalen; sr->sr_state = CEPH_SPARSE_READ_DATA_PRE; break; case CEPH_SPARSE_READ_DATA_PRE: /* Convert sr_datalen to host-endian */ sr->sr_datalen = le32_to_cpu((__force __le32)sr->sr_datalen); for (i = 0; i < count; i++) len += sr->sr_extent[i].len; if (sr->sr_datalen != len) { pr_warn_ratelimited("data len %u != extent len %llu\n", sr->sr_datalen, len); return -EREMOTEIO; } sr->sr_state = CEPH_SPARSE_READ_DATA; fallthrough; case CEPH_SPARSE_READ_DATA: if (sr->sr_index >= count) { sr->sr_state = CEPH_SPARSE_READ_HDR; goto next_op; } eoff = sr->sr_extent[sr->sr_index].off; elen = sr->sr_extent[sr->sr_index].len; dout("[%d] ext %d off 0x%llx len 0x%llx\n", o->o_osd, sr->sr_index, eoff, elen); if (elen > INT_MAX) { dout("Sparse read extent length too long (0x%llx)\n", elen); return -EREMOTEIO; } /* zero out anything from sr_pos to start of extent */ if (sr->sr_pos < eoff) advance_cursor(cursor, eoff - sr->sr_pos, true); /* Set position to end of extent */ sr->sr_pos = eoff + elen; /* send back the new length and nullify the ptr */ cursor->sr_resid = elen; ret = elen; *pbuf = NULL; /* Bump the array index */ ++sr->sr_index; break; } return ret; } static const struct ceph_connection_operations osd_con_ops = { .get = osd_get_con, .put = osd_put_con, .sparse_read = osd_sparse_read, .alloc_msg = osd_alloc_msg, .dispatch = osd_dispatch, .fault = osd_fault, .reencode_message = osd_reencode_message, .get_authorizer = osd_get_authorizer, .add_authorizer_challenge = osd_add_authorizer_challenge, .verify_authorizer_reply = osd_verify_authorizer_reply, .invalidate_authorizer = osd_invalidate_authorizer, .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, .get_auth_request = osd_get_auth_request, .handle_auth_reply_more = osd_handle_auth_reply_more, .handle_auth_done = osd_handle_auth_done, .handle_auth_bad_method = osd_handle_auth_bad_method, };
2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 // SPDX-License-Identifier: GPL-2.0-or-later /* * Contiguous Memory Allocator * * Copyright (c) 2010-2011 by Samsung Electronics. * Copyright IBM Corporation, 2013 * Copyright LG Electronics Inc., 2014 * Written by: * Marek Szyprowski <m.szyprowski@samsung.com> * Michal Nazarewicz <mina86@mina86.com> * Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * Joonsoo Kim <iamjoonsoo.kim@lge.com> */ #define pr_fmt(fmt) "cma: " fmt #define CREATE_TRACE_POINTS #include <linux/memblock.h> #include <linux/err.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/sizes.h> #include <linux/slab.h> #include <linux/string_choices.h> #include <linux/log2.h> #include <linux/cma.h> #include <linux/highmem.h> #include <linux/io.h> #include <linux/kmemleak.h> #include <trace/events/cma.h> #include "internal.h" #include "cma.h" struct cma cma_areas[MAX_CMA_AREAS]; unsigned int cma_area_count; phys_addr_t cma_get_base(const struct cma *cma) { WARN_ON_ONCE(cma->nranges != 1); return PFN_PHYS(cma->ranges[0].base_pfn); } unsigned long cma_get_size(const struct cma *cma) { return cma->count << PAGE_SHIFT; } const char *cma_get_name(const struct cma *cma) { return cma->name; } static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, unsigned int align_order) { if (align_order <= cma->order_per_bit) return 0; return (1UL << (align_order - cma->order_per_bit)) - 1; } /* * Find the offset of the base PFN from the specified align_order. * The value returned is represented in order_per_bits. */ static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, const struct cma_memrange *cmr, unsigned int align_order) { return (cmr->base_pfn & ((1UL << align_order) - 1)) >> cma->order_per_bit; } static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, unsigned long pages) { return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; } static void cma_clear_bitmap(struct cma *cma, const struct cma_memrange *cmr, unsigned long pfn, unsigned long count) { unsigned long bitmap_no, bitmap_count; unsigned long flags; bitmap_no = (pfn - cmr->base_pfn) >> cma->order_per_bit; bitmap_count = cma_bitmap_pages_to_bits(cma, count); spin_lock_irqsave(&cma->lock, flags); bitmap_clear(cmr->bitmap, bitmap_no, bitmap_count); cma->available_count += count; spin_unlock_irqrestore(&cma->lock, flags); } /* * Check if a CMA area contains no ranges that intersect with * multiple zones. Store the result in the flags in case * this gets called more than once. */ bool cma_validate_zones(struct cma *cma) { int r; unsigned long base_pfn; struct cma_memrange *cmr; bool valid_bit_set; /* * If already validated, return result of previous check. * Either the valid or invalid bit will be set if this * check has already been done. If neither is set, the * check has not been performed yet. */ valid_bit_set = test_bit(CMA_ZONES_VALID, &cma->flags); if (valid_bit_set || test_bit(CMA_ZONES_INVALID, &cma->flags)) return valid_bit_set; for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; base_pfn = cmr->base_pfn; /* * alloc_contig_range() requires the pfn range specified * to be in the same zone. Simplify by forcing the entire * CMA resv range to be in the same zone. */ WARN_ON_ONCE(!pfn_valid(base_pfn)); if (pfn_range_intersects_zones(cma->nid, base_pfn, cmr->count)) { set_bit(CMA_ZONES_INVALID, &cma->flags); return false; } } set_bit(CMA_ZONES_VALID, &cma->flags); return true; } static void __init cma_activate_area(struct cma *cma) { unsigned long pfn, end_pfn, early_pfn[CMA_MAX_RANGES]; int allocrange, r; struct cma_memrange *cmr; unsigned long bitmap_count, count; for (allocrange = 0; allocrange < cma->nranges; allocrange++) { cmr = &cma->ranges[allocrange]; early_pfn[allocrange] = cmr->early_pfn; cmr->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma, cmr), GFP_KERNEL); if (!cmr->bitmap) goto cleanup; } if (!cma_validate_zones(cma)) goto cleanup; for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; if (early_pfn[r] != cmr->base_pfn) { count = early_pfn[r] - cmr->base_pfn; bitmap_count = cma_bitmap_pages_to_bits(cma, count); bitmap_set(cmr->bitmap, 0, bitmap_count); } for (pfn = early_pfn[r]; pfn < cmr->base_pfn + cmr->count; pfn += pageblock_nr_pages) init_cma_reserved_pageblock(pfn_to_page(pfn)); } spin_lock_init(&cma->lock); mutex_init(&cma->alloc_mutex); #ifdef CONFIG_CMA_DEBUGFS INIT_HLIST_HEAD(&cma->mem_head); spin_lock_init(&cma->mem_head_lock); #endif set_bit(CMA_ACTIVATED, &cma->flags); return; cleanup: for (r = 0; r < allocrange; r++) bitmap_free(cma->ranges[r].bitmap); /* Expose all pages to the buddy, they are useless for CMA. */ if (!test_bit(CMA_RESERVE_PAGES_ON_ERROR, &cma->flags)) { for (r = 0; r < allocrange; r++) { cmr = &cma->ranges[r]; end_pfn = cmr->base_pfn + cmr->count; for (pfn = early_pfn[r]; pfn < end_pfn; pfn++) free_reserved_page(pfn_to_page(pfn)); } } totalcma_pages -= cma->count; cma->available_count = cma->count = 0; pr_err("CMA area %s could not be activated\n", cma->name); } static int __init cma_init_reserved_areas(void) { int i; for (i = 0; i < cma_area_count; i++) cma_activate_area(&cma_areas[i]); return 0; } core_initcall(cma_init_reserved_areas); void __init cma_reserve_pages_on_error(struct cma *cma) { set_bit(CMA_RESERVE_PAGES_ON_ERROR, &cma->flags); } static int __init cma_new_area(const char *name, phys_addr_t size, unsigned int order_per_bit, struct cma **res_cma) { struct cma *cma; if (cma_area_count == ARRAY_SIZE(cma_areas)) { pr_err("Not enough slots for CMA reserved regions!\n"); return -ENOSPC; } /* * Each reserved area must be initialised later, when more kernel * subsystems (like slab allocator) are available. */ cma = &cma_areas[cma_area_count]; cma_area_count++; if (name) snprintf(cma->name, CMA_MAX_NAME, "%s", name); else snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count); cma->available_count = cma->count = size >> PAGE_SHIFT; cma->order_per_bit = order_per_bit; *res_cma = cma; totalcma_pages += cma->count; return 0; } static void __init cma_drop_area(struct cma *cma) { totalcma_pages -= cma->count; cma_area_count--; } /** * cma_init_reserved_mem() - create custom contiguous area from reserved memory * @base: Base address of the reserved area * @size: Size of the reserved area (in bytes), * @order_per_bit: Order of pages represented by one bit on bitmap. * @name: The name of the area. If this parameter is NULL, the name of * the area will be set to "cmaN", where N is a running counter of * used areas. * @res_cma: Pointer to store the created cma region. * * This function creates custom contiguous area from already reserved memory. */ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, const char *name, struct cma **res_cma) { struct cma *cma; int ret; /* Sanity checks */ if (!size || !memblock_is_region_reserved(base, size)) return -EINVAL; /* * CMA uses CMA_MIN_ALIGNMENT_BYTES as alignment requirement which * needs pageblock_order to be initialized. Let's enforce it. */ if (!pageblock_order) { pr_err("pageblock_order not yet initialized. Called during early boot?\n"); return -EINVAL; } /* ensure minimal alignment required by mm core */ if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES)) return -EINVAL; ret = cma_new_area(name, size, order_per_bit, &cma); if (ret != 0) return ret; cma->ranges[0].base_pfn = PFN_DOWN(base); cma->ranges[0].early_pfn = PFN_DOWN(base); cma->ranges[0].count = cma->count; cma->nranges = 1; cma->nid = NUMA_NO_NODE; *res_cma = cma; return 0; } /* * Structure used while walking physical memory ranges and finding out * which one(s) to use for a CMA area. */ struct cma_init_memrange { phys_addr_t base; phys_addr_t size; struct list_head list; }; /* * Work array used during CMA initialization. */ static struct cma_init_memrange memranges[CMA_MAX_RANGES] __initdata; static bool __init revsizecmp(struct cma_init_memrange *mlp, struct cma_init_memrange *mrp) { return mlp->size > mrp->size; } static bool __init basecmp(struct cma_init_memrange *mlp, struct cma_init_memrange *mrp) { return mlp->base < mrp->base; } /* * Helper function to create sorted lists. */ static void __init list_insert_sorted( struct list_head *ranges, struct cma_init_memrange *mrp, bool (*cmp)(struct cma_init_memrange *lh, struct cma_init_memrange *rh)) { struct list_head *mp; struct cma_init_memrange *mlp; if (list_empty(ranges)) list_add(&mrp->list, ranges); else { list_for_each(mp, ranges) { mlp = list_entry(mp, struct cma_init_memrange, list); if (cmp(mlp, mrp)) break; } __list_add(&mrp->list, mlp->list.prev, &mlp->list); } } static int __init cma_fixed_reserve(phys_addr_t base, phys_addr_t size) { if (IS_ENABLED(CONFIG_HIGHMEM)) { phys_addr_t highmem_start = __pa(high_memory - 1) + 1; /* * If allocating at a fixed base the request region must not * cross the low/high memory boundary. */ if (base < highmem_start && base + size > highmem_start) { pr_err("Region at %pa defined on low/high memory boundary (%pa)\n", &base, &highmem_start); return -EINVAL; } } if (memblock_is_region_reserved(base, size) || memblock_reserve(base, size) < 0) { return -EBUSY; } return 0; } static phys_addr_t __init cma_alloc_mem(phys_addr_t base, phys_addr_t size, phys_addr_t align, phys_addr_t limit, int nid) { phys_addr_t addr = 0; /* * If there is enough memory, try a bottom-up allocation first. * It will place the new cma area close to the start of the node * and guarantee that the compaction is moving pages out of the * cma area and not into it. * Avoid using first 4GB to not interfere with constrained zones * like DMA/DMA32. */ #ifdef CONFIG_PHYS_ADDR_T_64BIT if (!memblock_bottom_up() && limit >= SZ_4G + size) { memblock_set_bottom_up(true); addr = memblock_alloc_range_nid(size, align, SZ_4G, limit, nid, true); memblock_set_bottom_up(false); } #endif /* * On systems with HIGHMEM try allocating from there before consuming * memory in lower zones. */ if (!addr && IS_ENABLED(CONFIG_HIGHMEM)) { phys_addr_t highmem = __pa(high_memory - 1) + 1; /* * All pages in the reserved area must come from the same zone. * If the requested region crosses the low/high memory boundary, * try allocating from high memory first and fall back to low * memory in case of failure. */ if (base < highmem && limit > highmem) { addr = memblock_alloc_range_nid(size, align, highmem, limit, nid, true); limit = highmem; } } if (!addr) addr = memblock_alloc_range_nid(size, align, base, limit, nid, true); return addr; } static int __init __cma_declare_contiguous_nid(phys_addr_t *basep, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, bool fixed, const char *name, struct cma **res_cma, int nid) { phys_addr_t memblock_end = memblock_end_of_DRAM(); phys_addr_t base = *basep; int ret; pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", __func__, &size, &base, &limit, &alignment); if (cma_area_count == ARRAY_SIZE(cma_areas)) { pr_err("Not enough slots for CMA reserved regions!\n"); return -ENOSPC; } if (!size) return -EINVAL; if (alignment && !is_power_of_2(alignment)) return -EINVAL; if (!IS_ENABLED(CONFIG_NUMA)) nid = NUMA_NO_NODE; /* Sanitise input arguments. */ alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); if (fixed && base & (alignment - 1)) { pr_err("Region at %pa must be aligned to %pa bytes\n", &base, &alignment); return -EINVAL; } base = ALIGN(base, alignment); size = ALIGN(size, alignment); limit &= ~(alignment - 1); if (!base) fixed = false; /* size should be aligned with order_per_bit */ if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) return -EINVAL; /* * If the limit is unspecified or above the memblock end, its effective * value will be the memblock end. Set it explicitly to simplify further * checks. */ if (limit == 0 || limit > memblock_end) limit = memblock_end; if (base + size > limit) { pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n", &size, &base, &limit); return -EINVAL; } /* Reserve memory */ if (fixed) { ret = cma_fixed_reserve(base, size); if (ret) return ret; } else { base = cma_alloc_mem(base, size, alignment, limit, nid); if (!base) return -ENOMEM; /* * kmemleak scans/reads tracked objects for pointers to other * objects but this address isn't mapped and accessible */ kmemleak_ignore_phys(base); } ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); if (ret) { memblock_phys_free(base, size); return ret; } (*res_cma)->nid = nid; *basep = base; return 0; } /* * Create CMA areas with a total size of @total_size. A normal allocation * for one area is tried first. If that fails, the biggest memblock * ranges above 4G are selected, and allocated bottom up. * * The complexity here is not great, but this function will only be * called during boot, and the lists operated on have fewer than * CMA_MAX_RANGES elements (default value: 8). */ int __init cma_declare_contiguous_multi(phys_addr_t total_size, phys_addr_t align, unsigned int order_per_bit, const char *name, struct cma **res_cma, int nid) { phys_addr_t start = 0, end; phys_addr_t size, sizesum, sizeleft; struct cma_init_memrange *mrp, *mlp, *failed; struct cma_memrange *cmrp; LIST_HEAD(ranges); LIST_HEAD(final_ranges); struct list_head *mp, *next; int ret, nr = 1; u64 i; struct cma *cma; /* * First, try it the normal way, producing just one range. */ ret = __cma_declare_contiguous_nid(&start, total_size, 0, align, order_per_bit, false, name, res_cma, nid); if (ret != -ENOMEM) goto out; /* * Couldn't find one range that fits our needs, so try multiple * ranges. * * No need to do the alignment checks here, the call to * cma_declare_contiguous_nid above would have caught * any issues. With the checks, we know that: * * - @align is a power of 2 * - @align is >= pageblock alignment * - @size is aligned to @align and to @order_per_bit * * So, as long as we create ranges that have a base * aligned to @align, and a size that is aligned to * both @align and @order_to_bit, things will work out. */ nr = 0; sizesum = 0; failed = NULL; ret = cma_new_area(name, total_size, order_per_bit, &cma); if (ret != 0) goto out; align = max_t(phys_addr_t, align, CMA_MIN_ALIGNMENT_BYTES); /* * Create a list of ranges above 4G, largest range first. */ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) { if (upper_32_bits(start) == 0) continue; start = ALIGN(start, align); if (start >= end) continue; end = ALIGN_DOWN(end, align); if (end <= start) continue; size = end - start; size = ALIGN_DOWN(size, (PAGE_SIZE << order_per_bit)); if (!size) continue; sizesum += size; pr_debug("consider %016llx - %016llx\n", (u64)start, (u64)end); /* * If we don't yet have used the maximum number of * areas, grab a new one. * * If we can't use anymore, see if this range is not * smaller than the smallest one already recorded. If * not, re-use the smallest element. */ if (nr < CMA_MAX_RANGES) mrp = &memranges[nr++]; else { mrp = list_last_entry(&ranges, struct cma_init_memrange, list); if (size < mrp->size) continue; list_del(&mrp->list); sizesum -= mrp->size; pr_debug("deleted %016llx - %016llx from the list\n", (u64)mrp->base, (u64)mrp->base + size); } mrp->base = start; mrp->size = size; /* * Now do a sorted insert. */ list_insert_sorted(&ranges, mrp, revsizecmp); pr_debug("added %016llx - %016llx to the list\n", (u64)mrp->base, (u64)mrp->base + size); pr_debug("total size now %llu\n", (u64)sizesum); } /* * There is not enough room in the CMA_MAX_RANGES largest * ranges, so bail out. */ if (sizesum < total_size) { cma_drop_area(cma); ret = -ENOMEM; goto out; } /* * Found ranges that provide enough combined space. * Now, sorted them by address, smallest first, because we * want to mimic a bottom-up memblock allocation. */ sizesum = 0; list_for_each_safe(mp, next, &ranges) { mlp = list_entry(mp, struct cma_init_memrange, list); list_del(mp); list_insert_sorted(&final_ranges, mlp, basecmp); sizesum += mlp->size; if (sizesum >= total_size) break; } /* * Walk the final list, and add a CMA range for * each range, possibly not using the last one fully. */ nr = 0; sizeleft = total_size; list_for_each(mp, &final_ranges) { mlp = list_entry(mp, struct cma_init_memrange, list); size = min(sizeleft, mlp->size); if (memblock_reserve(mlp->base, size)) { /* * Unexpected error. Could go on to * the next one, but just abort to * be safe. */ failed = mlp; break; } pr_debug("created region %d: %016llx - %016llx\n", nr, (u64)mlp->base, (u64)mlp->base + size); cmrp = &cma->ranges[nr++]; cmrp->base_pfn = PHYS_PFN(mlp->base); cmrp->early_pfn = cmrp->base_pfn; cmrp->count = size >> PAGE_SHIFT; sizeleft -= size; if (sizeleft == 0) break; } if (failed) { list_for_each(mp, &final_ranges) { mlp = list_entry(mp, struct cma_init_memrange, list); if (mlp == failed) break; memblock_phys_free(mlp->base, mlp->size); } cma_drop_area(cma); ret = -ENOMEM; goto out; } cma->nranges = nr; cma->nid = nid; *res_cma = cma; out: if (ret != 0) pr_err("Failed to reserve %lu MiB\n", (unsigned long)total_size / SZ_1M); else pr_info("Reserved %lu MiB in %d range%s\n", (unsigned long)total_size / SZ_1M, nr, str_plural(nr)); return ret; } /** * cma_declare_contiguous_nid() - reserve custom contiguous area * @base: Base address of the reserved area optional, use 0 for any * @size: Size of the reserved area (in bytes), * @limit: End address of the reserved memory (optional, 0 for any). * @alignment: Alignment for the CMA area, should be power of 2 or zero * @order_per_bit: Order of pages represented by one bit on bitmap. * @fixed: hint about where to place the reserved area * @name: The name of the area. See function cma_init_reserved_mem() * @res_cma: Pointer to store the created cma region. * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * This function reserves memory from early allocator. It should be * called by arch specific code once the early allocator (memblock or bootmem) * has been activated and all other subsystems have already allocated/reserved * memory. This function allows to create custom reserved areas. * * If @fixed is true, reserve contiguous area at exactly @base. If false, * reserve in range from @base to @limit. */ int __init cma_declare_contiguous_nid(phys_addr_t base, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, bool fixed, const char *name, struct cma **res_cma, int nid) { int ret; ret = __cma_declare_contiguous_nid(&base, size, limit, alignment, order_per_bit, fixed, name, res_cma, nid); if (ret != 0) pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); else pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, &base); return ret; } static void cma_debug_show_areas(struct cma *cma) { unsigned long start, end; unsigned long nr_part; unsigned long nbits; int r; struct cma_memrange *cmr; spin_lock_irq(&cma->lock); pr_info("number of available pages: "); for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; nbits = cma_bitmap_maxno(cma, cmr); pr_info("range %d: ", r); for_each_clear_bitrange(start, end, cmr->bitmap, nbits) { nr_part = (end - start) << cma->order_per_bit; pr_cont("%s%lu@%lu", start ? "+" : "", nr_part, start); } pr_info("\n"); } pr_cont("=> %lu free of %lu total pages\n", cma->available_count, cma->count); spin_unlock_irq(&cma->lock); } static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, unsigned long count, unsigned int align, struct page **pagep, gfp_t gfp) { unsigned long mask, offset; unsigned long pfn = -1; unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; int ret = -EBUSY; struct page *page = NULL; mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, cmr, align); bitmap_maxno = cma_bitmap_maxno(cma, cmr); bitmap_count = cma_bitmap_pages_to_bits(cma, count); if (bitmap_count > bitmap_maxno) goto out; for (;;) { spin_lock_irq(&cma->lock); /* * If the request is larger than the available number * of pages, stop right away. */ if (count > cma->available_count) { spin_unlock_irq(&cma->lock); break; } bitmap_no = bitmap_find_next_zero_area_off(cmr->bitmap, bitmap_maxno, start, bitmap_count, mask, offset); if (bitmap_no >= bitmap_maxno) { spin_unlock_irq(&cma->lock); break; } bitmap_set(cmr->bitmap, bitmap_no, bitmap_count); cma->available_count -= count; /* * It's safe to drop the lock here. We've marked this region for * our exclusive use. If the migration fails we will take the * lock again and unmark it. */ spin_unlock_irq(&cma->lock); pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma->alloc_mutex); ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); mutex_unlock(&cma->alloc_mutex); if (ret == 0) { page = pfn_to_page(pfn); break; } cma_clear_bitmap(cma, cmr, pfn, count); if (ret != -EBUSY) break; pr_debug("%s(): memory range at pfn 0x%lx %p is busy, retrying\n", __func__, pfn, pfn_to_page(pfn)); trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), count, align); /* try again with a bit different memory target */ start = bitmap_no + mask + 1; } out: *pagep = page; return ret; } static struct page *__cma_alloc(struct cma *cma, unsigned long count, unsigned int align, gfp_t gfp) { struct page *page = NULL; int ret = -ENOMEM, r; unsigned long i; const char *name = cma ? cma->name : NULL; if (!cma || !cma->count) return page; pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__, (void *)cma, cma->name, count, align); if (!count) return page; trace_cma_alloc_start(name, count, align); for (r = 0; r < cma->nranges; r++) { page = NULL; ret = cma_range_alloc(cma, &cma->ranges[r], count, align, &page, gfp); if (ret != -EBUSY || page) break; } /* * CMA can allocate multiple page blocks, which results in different * blocks being marked with different tags. Reset the tags to ignore * those page blocks. */ if (page) { for (i = 0; i < count; i++) page_kasan_tag_reset(nth_page(page, i)); } if (ret && !(gfp & __GFP_NOWARN)) { pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n", __func__, cma->name, count, ret); cma_debug_show_areas(cma); } pr_debug("%s(): returned %p\n", __func__, page); trace_cma_alloc_finish(name, page ? page_to_pfn(page) : 0, page, count, align, ret); if (page) { count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); } else { count_vm_event(CMA_ALLOC_FAIL); cma_sysfs_account_fail_pages(cma, count); } return page; } /** * cma_alloc() - allocate pages from contiguous area * @cma: Contiguous memory region for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). * @no_warn: Avoid printing message about failed allocation * * This function allocates part of contiguous memory on specific * contiguous memory area. */ struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn) { return __cma_alloc(cma, count, align, GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); } struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) { struct page *page; if (WARN_ON(!order || !(gfp & __GFP_COMP))) return NULL; page = __cma_alloc(cma, 1 << order, order, gfp); return page ? page_folio(page) : NULL; } bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count) { unsigned long pfn, end; int r; struct cma_memrange *cmr; bool ret; if (!cma || !pages || count > cma->count) return false; pfn = page_to_pfn(pages); ret = false; for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; end = cmr->base_pfn + cmr->count; if (pfn >= cmr->base_pfn && pfn < end) { ret = pfn + count <= end; break; } } if (!ret) pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); return ret; } /** * cma_release() - release allocated pages * @cma: Contiguous memory region for which the allocation is performed. * @pages: Allocated pages. * @count: Number of allocated pages. * * This function releases memory allocated by cma_alloc(). * It returns false when provided pages do not belong to contiguous area and * true otherwise. */ bool cma_release(struct cma *cma, const struct page *pages, unsigned long count) { struct cma_memrange *cmr; unsigned long pfn, end_pfn; int r; pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); if (!cma_pages_valid(cma, pages, count)) return false; pfn = page_to_pfn(pages); end_pfn = pfn + count; for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; if (pfn >= cmr->base_pfn && pfn < (cmr->base_pfn + cmr->count)) { VM_BUG_ON(end_pfn > cmr->base_pfn + cmr->count); break; } } if (r == cma->nranges) return false; free_contig_range(pfn, count); cma_clear_bitmap(cma, cmr, pfn, count); cma_sysfs_account_release_pages(cma, count); trace_cma_release(cma->name, pfn, pages, count); return true; } bool cma_free_folio(struct cma *cma, const struct folio *folio) { if (WARN_ON(!folio_test_large(folio))) return false; return cma_release(cma, &folio->page, folio_nr_pages(folio)); } int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) { int i; for (i = 0; i < cma_area_count; i++) { int ret = it(&cma_areas[i], data); if (ret) return ret; } return 0; } bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end) { int r; struct cma_memrange *cmr; unsigned long rstart, rend; for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; rstart = PFN_PHYS(cmr->base_pfn); rend = PFN_PHYS(cmr->base_pfn + cmr->count); if (end < rstart) continue; if (start >= rend) continue; return true; } return false; } /* * Very basic function to reserve memory from a CMA area that has not * yet been activated. This is expected to be called early, when the * system is single-threaded, so there is no locking. The alignment * checking is restrictive - only pageblock-aligned areas * (CMA_MIN_ALIGNMENT_BYTES) may be reserved through this function. * This keeps things simple, and is enough for the current use case. * * The CMA bitmaps have not yet been allocated, so just start * reserving from the bottom up, using a PFN to keep track * of what has been reserved. Unreserving is not possible. * * The caller is responsible for initializing the page structures * in the area properly, since this just points to memblock-allocated * memory. The caller should subsequently use init_cma_pageblock to * set the migrate type and CMA stats the pageblocks that were reserved. * * If the CMA area fails to activate later, memory obtained through * this interface is not handed to the page allocator, this is * the responsibility of the caller (e.g. like normal memblock-allocated * memory). */ void __init *cma_reserve_early(struct cma *cma, unsigned long size) { int r; struct cma_memrange *cmr; unsigned long available; void *ret = NULL; if (!cma || !cma->count) return NULL; /* * Can only be called early in init. */ if (test_bit(CMA_ACTIVATED, &cma->flags)) return NULL; if (!IS_ALIGNED(size, CMA_MIN_ALIGNMENT_BYTES)) return NULL; if (!IS_ALIGNED(size, (PAGE_SIZE << cma->order_per_bit))) return NULL; size >>= PAGE_SHIFT; if (size > cma->available_count) return NULL; for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; available = cmr->count - (cmr->early_pfn - cmr->base_pfn); if (size <= available) { ret = phys_to_virt(PFN_PHYS(cmr->early_pfn)); cmr->early_pfn += size; cma->available_count -= size; return ret; } } return ret; }
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2001 Vojtech Pavlik * * CATC EL1210A NetMate USB Ethernet driver * * Sponsored by SuSE * * Based on the work of * Donald Becker * * Old chipset support added by Simon Evans <spse@secret.org.uk> 2002 * - adds support for Belkin F5U011 */ /* * * Should you need to contact me, the author, you can do so either by * e-mail - mail your message to <vojtech@suse.cz>, or by paper mail: * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/ethtool.h> #include <linux/crc32.h> #include <linux/bitops.h> #include <linux/gfp.h> #include <linux/uaccess.h> #undef DEBUG #include <linux/usb.h> /* * Version information. */ #define DRIVER_VERSION "v2.8" #define DRIVER_AUTHOR "Vojtech Pavlik <vojtech@suse.cz>" #define DRIVER_DESC "CATC EL1210A NetMate USB Ethernet driver" #define SHORT_DRIVER_DESC "EL1210A NetMate USB Ethernet" MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); static const char driver_name[] = "catc"; /* * Some defines. */ #define STATS_UPDATE (HZ) /* Time between stats updates */ #define TX_TIMEOUT (5*HZ) /* Max time the queue can be stopped */ #define PKT_SZ 1536 /* Max Ethernet packet size */ #define RX_MAX_BURST 15 /* Max packets per rx buffer (> 0, < 16) */ #define TX_MAX_BURST 15 /* Max full sized packets per tx buffer (> 0) */ #define CTRL_QUEUE 16 /* Max control requests in flight (power of two) */ #define RX_PKT_SZ 1600 /* Max size of receive packet for F5U011 */ /* * Control requests. */ enum control_requests { ReadMem = 0xf1, GetMac = 0xf2, Reset = 0xf4, SetMac = 0xf5, SetRxMode = 0xf5, /* F5U011 only */ WriteROM = 0xf8, SetReg = 0xfa, GetReg = 0xfb, WriteMem = 0xfc, ReadROM = 0xfd, }; /* * Registers. */ enum register_offsets { TxBufCount = 0x20, RxBufCount = 0x21, OpModes = 0x22, TxQed = 0x23, RxQed = 0x24, MaxBurst = 0x25, RxUnit = 0x60, EthStatus = 0x61, StationAddr0 = 0x67, EthStats = 0x69, LEDCtrl = 0x81, }; enum eth_stats { TxSingleColl = 0x00, TxMultiColl = 0x02, TxExcessColl = 0x04, RxFramErr = 0x06, }; enum op_mode_bits { Op3MemWaits = 0x03, OpLenInclude = 0x08, OpRxMerge = 0x10, OpTxMerge = 0x20, OpWin95bugfix = 0x40, OpLoopback = 0x80, }; enum rx_filter_bits { RxEnable = 0x01, RxPolarity = 0x02, RxForceOK = 0x04, RxMultiCast = 0x08, RxPromisc = 0x10, AltRxPromisc = 0x20, /* F5U011 uses different bit */ }; enum led_values { LEDFast = 0x01, LEDSlow = 0x02, LEDFlash = 0x03, LEDPulse = 0x04, LEDLink = 0x08, }; enum link_status { LinkNoChange = 0, LinkGood = 1, LinkBad = 2 }; /* * The catc struct. */ #define CTRL_RUNNING 0 #define RX_RUNNING 1 #define TX_RUNNING 2 struct catc { struct net_device *netdev; struct usb_device *usbdev; unsigned long flags; unsigned int tx_ptr, tx_idx; unsigned int ctrl_head, ctrl_tail; spinlock_t tx_lock, ctrl_lock; u8 tx_buf[2][TX_MAX_BURST * (PKT_SZ + 2)]; u8 rx_buf[RX_MAX_BURST * (PKT_SZ + 2)]; u8 irq_buf[2]; u8 ctrl_buf[64]; struct usb_ctrlrequest ctrl_dr; struct timer_list timer; u8 stats_buf[8]; u16 stats_vals[4]; unsigned long last_stats; u8 multicast[64]; struct ctrl_queue { u8 dir; u8 request; u16 value; u16 index; void *buf; int len; void (*callback)(struct catc *catc, struct ctrl_queue *q); } ctrl_queue[CTRL_QUEUE]; struct urb *tx_urb, *rx_urb, *irq_urb, *ctrl_urb; u8 is_f5u011; /* Set if device is an F5U011 */ u8 rxmode[2]; /* Used for F5U011 */ atomic_t recq_sz; /* Used for F5U011 - counter of waiting rx packets */ }; /* * Useful macros. */ #define catc_get_mac(catc, mac) catc_ctrl_msg(catc, USB_DIR_IN, GetMac, 0, 0, mac, 6) #define catc_reset(catc) catc_ctrl_msg(catc, USB_DIR_OUT, Reset, 0, 0, NULL, 0) #define catc_set_reg(catc, reg, val) catc_ctrl_msg(catc, USB_DIR_OUT, SetReg, val, reg, NULL, 0) #define catc_get_reg(catc, reg, buf) catc_ctrl_msg(catc, USB_DIR_IN, GetReg, 0, reg, buf, 1) #define catc_write_mem(catc, addr, buf, size) catc_ctrl_msg(catc, USB_DIR_OUT, WriteMem, 0, addr, buf, size) #define catc_read_mem(catc, addr, buf, size) catc_ctrl_msg(catc, USB_DIR_IN, ReadMem, 0, addr, buf, size) #define f5u011_rxmode(catc, rxmode) catc_ctrl_msg(catc, USB_DIR_OUT, SetRxMode, 0, 1, rxmode, 2) #define f5u011_rxmode_async(catc, rxmode) catc_ctrl_async(catc, USB_DIR_OUT, SetRxMode, 0, 1, &rxmode, 2, NULL) #define f5u011_mchash_async(catc, hash) catc_ctrl_async(catc, USB_DIR_OUT, SetRxMode, 0, 2, &hash, 8, NULL) #define catc_set_reg_async(catc, reg, val) catc_ctrl_async(catc, USB_DIR_OUT, SetReg, val, reg, NULL, 0, NULL) #define catc_get_reg_async(catc, reg, cb) catc_ctrl_async(catc, USB_DIR_IN, GetReg, 0, reg, NULL, 1, cb) #define catc_write_mem_async(catc, addr, buf, size) catc_ctrl_async(catc, USB_DIR_OUT, WriteMem, 0, addr, buf, size, NULL) /* * Receive routines. */ static void catc_rx_done(struct urb *urb) { struct catc *catc = urb->context; u8 *pkt_start = urb->transfer_buffer; struct sk_buff *skb; int pkt_len, pkt_offset = 0; int status = urb->status; if (!catc->is_f5u011) { clear_bit(RX_RUNNING, &catc->flags); pkt_offset = 2; } if (status) { dev_dbg(&urb->dev->dev, "rx_done, status %d, length %d\n", status, urb->actual_length); return; } do { if(!catc->is_f5u011) { pkt_len = le16_to_cpup((__le16*)pkt_start); if (pkt_len > urb->actual_length) { catc->netdev->stats.rx_length_errors++; catc->netdev->stats.rx_errors++; break; } } else { pkt_len = urb->actual_length; } if (!(skb = dev_alloc_skb(pkt_len))) return; skb_copy_to_linear_data(skb, pkt_start + pkt_offset, pkt_len); skb_put(skb, pkt_len); skb->protocol = eth_type_trans(skb, catc->netdev); netif_rx(skb); catc->netdev->stats.rx_packets++; catc->netdev->stats.rx_bytes += pkt_len; /* F5U011 only does one packet per RX */ if (catc->is_f5u011) break; pkt_start += (((pkt_len + 1) >> 6) + 1) << 6; } while (pkt_start - (u8 *) urb->transfer_buffer < urb->actual_length); if (catc->is_f5u011) { if (atomic_read(&catc->recq_sz)) { int state; atomic_dec(&catc->recq_sz); netdev_dbg(catc->netdev, "getting extra packet\n"); urb->dev = catc->usbdev; if ((state = usb_submit_urb(urb, GFP_ATOMIC)) < 0) { netdev_dbg(catc->netdev, "submit(rx_urb) status %d\n", state); } } else { clear_bit(RX_RUNNING, &catc->flags); } } } static void catc_irq_done(struct urb *urb) { struct catc *catc = urb->context; u8 *data = urb->transfer_buffer; int status = urb->status; unsigned int hasdata, linksts = LinkNoChange; int res; if (!catc->is_f5u011) { hasdata = data[1] & 0x80; if (data[1] & 0x40) linksts = LinkGood; else if (data[1] & 0x20) linksts = LinkBad; } else { hasdata = (unsigned int)(be16_to_cpup((__be16*)data) & 0x0fff); if (data[0] == 0x90) linksts = LinkGood; else if (data[0] == 0xA0) linksts = LinkBad; } switch (status) { case 0: /* success */ break; case -ECONNRESET: /* unlink */ case -ENOENT: case -ESHUTDOWN: return; /* -EPIPE: should clear the halt */ default: /* error */ dev_dbg(&urb->dev->dev, "irq_done, status %d, data %02x %02x.\n", status, data[0], data[1]); goto resubmit; } if (linksts == LinkGood) { netif_carrier_on(catc->netdev); netdev_dbg(catc->netdev, "link ok\n"); } if (linksts == LinkBad) { netif_carrier_off(catc->netdev); netdev_dbg(catc->netdev, "link bad\n"); } if (hasdata) { if (test_and_set_bit(RX_RUNNING, &catc->flags)) { if (catc->is_f5u011) atomic_inc(&catc->recq_sz); } else { catc->rx_urb->dev = catc->usbdev; if ((res = usb_submit_urb(catc->rx_urb, GFP_ATOMIC)) < 0) { dev_err(&catc->usbdev->dev, "submit(rx_urb) status %d\n", res); } } } resubmit: res = usb_submit_urb (urb, GFP_ATOMIC); if (res) dev_err(&catc->usbdev->dev, "can't resubmit intr, %s-%s, status %d\n", catc->usbdev->bus->bus_name, catc->usbdev->devpath, res); } /* * Transmit routines. */ static int catc_tx_run(struct catc *catc) { int status; if (catc->is_f5u011) catc->tx_ptr = (catc->tx_ptr + 63) & ~63; catc->tx_urb->transfer_buffer_length = catc->tx_ptr; catc->tx_urb->transfer_buffer = catc->tx_buf[catc->tx_idx]; catc->tx_urb->dev = catc->usbdev; if ((status = usb_submit_urb(catc->tx_urb, GFP_ATOMIC)) < 0) dev_err(&catc->usbdev->dev, "submit(tx_urb), status %d\n", status); catc->tx_idx = !catc->tx_idx; catc->tx_ptr = 0; netif_trans_update(catc->netdev); return status; } static void catc_tx_done(struct urb *urb) { struct catc *catc = urb->context; unsigned long flags; int r, status = urb->status; if (status == -ECONNRESET) { dev_dbg(&urb->dev->dev, "Tx Reset.\n"); urb->status = 0; netif_trans_update(catc->netdev); catc->netdev->stats.tx_errors++; clear_bit(TX_RUNNING, &catc->flags); netif_wake_queue(catc->netdev); return; } if (status) { dev_dbg(&urb->dev->dev, "tx_done, status %d, length %d\n", status, urb->actual_length); return; } spin_lock_irqsave(&catc->tx_lock, flags); if (catc->tx_ptr) { r = catc_tx_run(catc); if (unlikely(r < 0)) clear_bit(TX_RUNNING, &catc->flags); } else { clear_bit(TX_RUNNING, &catc->flags); } netif_wake_queue(catc->netdev); spin_unlock_irqrestore(&catc->tx_lock, flags); } static netdev_tx_t catc_start_xmit(struct sk_buff *skb, struct net_device *netdev) { struct catc *catc = netdev_priv(netdev); unsigned long flags; int r = 0; char *tx_buf; spin_lock_irqsave(&catc->tx_lock, flags); catc->tx_ptr = (((catc->tx_ptr - 1) >> 6) + 1) << 6; tx_buf = catc->tx_buf[catc->tx_idx] + catc->tx_ptr; if (catc->is_f5u011) *(__be16 *)tx_buf = cpu_to_be16(skb->len); else *(__le16 *)tx_buf = cpu_to_le16(skb->len); skb_copy_from_linear_data(skb, tx_buf + 2, skb->len); catc->tx_ptr += skb->len + 2; if (!test_and_set_bit(TX_RUNNING, &catc->flags)) { r = catc_tx_run(catc); if (r < 0) clear_bit(TX_RUNNING, &catc->flags); } if ((catc->is_f5u011 && catc->tx_ptr) || (catc->tx_ptr >= ((TX_MAX_BURST - 1) * (PKT_SZ + 2)))) netif_stop_queue(netdev); spin_unlock_irqrestore(&catc->tx_lock, flags); if (r >= 0) { catc->netdev->stats.tx_bytes += skb->len; catc->netdev->stats.tx_packets++; } dev_kfree_skb(skb); return NETDEV_TX_OK; } static void catc_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct catc *catc = netdev_priv(netdev); dev_warn(&netdev->dev, "Transmit timed out.\n"); usb_unlink_urb(catc->tx_urb); } /* * Control messages. */ static int catc_ctrl_msg(struct catc *catc, u8 dir, u8 request, u16 value, u16 index, void *buf, int len) { int retval = usb_control_msg(catc->usbdev, dir ? usb_rcvctrlpipe(catc->usbdev, 0) : usb_sndctrlpipe(catc->usbdev, 0), request, 0x40 | dir, value, index, buf, len, 1000); return retval < 0 ? retval : 0; } static void catc_ctrl_run(struct catc *catc) { struct ctrl_queue *q = catc->ctrl_queue + catc->ctrl_tail; struct usb_device *usbdev = catc->usbdev; struct urb *urb = catc->ctrl_urb; struct usb_ctrlrequest *dr = &catc->ctrl_dr; int status; dr->bRequest = q->request; dr->bRequestType = 0x40 | q->dir; dr->wValue = cpu_to_le16(q->value); dr->wIndex = cpu_to_le16(q->index); dr->wLength = cpu_to_le16(q->len); urb->pipe = q->dir ? usb_rcvctrlpipe(usbdev, 0) : usb_sndctrlpipe(usbdev, 0); urb->transfer_buffer_length = q->len; urb->transfer_buffer = catc->ctrl_buf; urb->setup_packet = (void *) dr; urb->dev = usbdev; if (!q->dir && q->buf && q->len) memcpy(catc->ctrl_buf, q->buf, q->len); if ((status = usb_submit_urb(catc->ctrl_urb, GFP_ATOMIC))) dev_err(&catc->usbdev->dev, "submit(ctrl_urb) status %d\n", status); } static void catc_ctrl_done(struct urb *urb) { struct catc *catc = urb->context; struct ctrl_queue *q; unsigned long flags; int status = urb->status; if (status) dev_dbg(&urb->dev->dev, "ctrl_done, status %d, len %d.\n", status, urb->actual_length); spin_lock_irqsave(&catc->ctrl_lock, flags); q = catc->ctrl_queue + catc->ctrl_tail; if (q->dir) { if (q->buf && q->len) memcpy(q->buf, catc->ctrl_buf, q->len); else q->buf = catc->ctrl_buf; } if (q->callback) q->callback(catc, q); catc->ctrl_tail = (catc->ctrl_tail + 1) & (CTRL_QUEUE - 1); if (catc->ctrl_head != catc->ctrl_tail) catc_ctrl_run(catc); else clear_bit(CTRL_RUNNING, &catc->flags); spin_unlock_irqrestore(&catc->ctrl_lock, flags); } static int catc_ctrl_async(struct catc *catc, u8 dir, u8 request, u16 value, u16 index, void *buf, int len, void (*callback)(struct catc *catc, struct ctrl_queue *q)) { struct ctrl_queue *q; int retval = 0; unsigned long flags; spin_lock_irqsave(&catc->ctrl_lock, flags); q = catc->ctrl_queue + catc->ctrl_head; q->dir = dir; q->request = request; q->value = value; q->index = index; q->buf = buf; q->len = len; q->callback = callback; catc->ctrl_head = (catc->ctrl_head + 1) & (CTRL_QUEUE - 1); if (catc->ctrl_head == catc->ctrl_tail) { dev_err(&catc->usbdev->dev, "ctrl queue full\n"); catc->ctrl_tail = (catc->ctrl_tail + 1) & (CTRL_QUEUE - 1); retval = -1; } if (!test_and_set_bit(CTRL_RUNNING, &catc->flags)) catc_ctrl_run(catc); spin_unlock_irqrestore(&catc->ctrl_lock, flags); return retval; } /* * Statistics. */ static void catc_stats_done(struct catc *catc, struct ctrl_queue *q) { int index = q->index - EthStats; u16 data, last; catc->stats_buf[index] = *((char *)q->buf); if (index & 1) return; data = ((u16)catc->stats_buf[index] << 8) | catc->stats_buf[index + 1]; last = catc->stats_vals[index >> 1]; switch (index) { case TxSingleColl: case TxMultiColl: catc->netdev->stats.collisions += data - last; break; case TxExcessColl: catc->netdev->stats.tx_aborted_errors += data - last; catc->netdev->stats.tx_errors += data - last; break; case RxFramErr: catc->netdev->stats.rx_frame_errors += data - last; catc->netdev->stats.rx_errors += data - last; break; } catc->stats_vals[index >> 1] = data; } static void catc_stats_timer(struct timer_list *t) { struct catc *catc = timer_container_of(catc, t, timer); int i; for (i = 0; i < 8; i++) catc_get_reg_async(catc, EthStats + 7 - i, catc_stats_done); mod_timer(&catc->timer, jiffies + STATS_UPDATE); } /* * Receive modes. Broadcast, Multicast, Promisc. */ static void catc_multicast(const unsigned char *addr, u8 *multicast) { u32 crc; crc = ether_crc_le(6, addr); multicast[(crc >> 3) & 0x3f] |= 1 << (crc & 7); } static void catc_set_multicast_list(struct net_device *netdev) { struct catc *catc = netdev_priv(netdev); struct netdev_hw_addr *ha; u8 broadcast[ETH_ALEN]; u8 rx = RxEnable | RxPolarity | RxMultiCast; eth_broadcast_addr(broadcast); memset(catc->multicast, 0, 64); catc_multicast(broadcast, catc->multicast); catc_multicast(netdev->dev_addr, catc->multicast); if (netdev->flags & IFF_PROMISC) { memset(catc->multicast, 0xff, 64); rx |= (!catc->is_f5u011) ? RxPromisc : AltRxPromisc; } if (netdev->flags & IFF_ALLMULTI) { memset(catc->multicast, 0xff, 64); } else { netdev_for_each_mc_addr(ha, netdev) { u32 crc = ether_crc_le(6, ha->addr); if (!catc->is_f5u011) { catc->multicast[(crc >> 3) & 0x3f] |= 1 << (crc & 7); } else { catc->multicast[7-(crc >> 29)] |= 1 << ((crc >> 26) & 7); } } } if (!catc->is_f5u011) { catc_set_reg_async(catc, RxUnit, rx); catc_write_mem_async(catc, 0xfa80, catc->multicast, 64); } else { f5u011_mchash_async(catc, catc->multicast); if (catc->rxmode[0] != rx) { catc->rxmode[0] = rx; netdev_dbg(catc->netdev, "Setting RX mode to %2.2X %2.2X\n", catc->rxmode[0], catc->rxmode[1]); f5u011_rxmode_async(catc, catc->rxmode); } } } static void catc_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct catc *catc = netdev_priv(dev); strscpy(info->driver, driver_name, sizeof(info->driver)); strscpy(info->version, DRIVER_VERSION, sizeof(info->version)); usb_make_path(catc->usbdev, info->bus_info, sizeof(info->bus_info)); } static int catc_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct catc *catc = netdev_priv(dev); if (!catc->is_f5u011) return -EOPNOTSUPP; ethtool_link_ksettings_zero_link_mode(cmd, supported); ethtool_link_ksettings_add_link_mode(cmd, supported, 10baseT_Half); ethtool_link_ksettings_add_link_mode(cmd, supported, TP); ethtool_link_ksettings_zero_link_mode(cmd, advertising); ethtool_link_ksettings_add_link_mode(cmd, advertising, 10baseT_Half); ethtool_link_ksettings_add_link_mode(cmd, advertising, TP); cmd->base.speed = SPEED_10; cmd->base.duplex = DUPLEX_HALF; cmd->base.port = PORT_TP; cmd->base.phy_address = 0; cmd->base.autoneg = AUTONEG_DISABLE; return 0; } static const struct ethtool_ops ops = { .get_drvinfo = catc_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = catc_get_link_ksettings, }; /* * Open, close. */ static int catc_open(struct net_device *netdev) { struct catc *catc = netdev_priv(netdev); int status; catc->irq_urb->dev = catc->usbdev; if ((status = usb_submit_urb(catc->irq_urb, GFP_KERNEL)) < 0) { dev_err(&catc->usbdev->dev, "submit(irq_urb) status %d\n", status); return -1; } netif_start_queue(netdev); if (!catc->is_f5u011) mod_timer(&catc->timer, jiffies + STATS_UPDATE); return 0; } static int catc_stop(struct net_device *netdev) { struct catc *catc = netdev_priv(netdev); netif_stop_queue(netdev); if (!catc->is_f5u011) timer_delete_sync(&catc->timer); usb_kill_urb(catc->rx_urb); usb_kill_urb(catc->tx_urb); usb_kill_urb(catc->irq_urb); usb_kill_urb(catc->ctrl_urb); return 0; } static const struct net_device_ops catc_netdev_ops = { .ndo_open = catc_open, .ndo_stop = catc_stop, .ndo_start_xmit = catc_start_xmit, .ndo_tx_timeout = catc_tx_timeout, .ndo_set_rx_mode = catc_set_multicast_list, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; /* * USB probe, disconnect. */ static int catc_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct device *dev = &intf->dev; struct usb_device *usbdev = interface_to_usbdev(intf); struct net_device *netdev; struct catc *catc; u8 broadcast[ETH_ALEN]; u8 *macbuf; int pktsz, ret = -ENOMEM; macbuf = kmalloc(ETH_ALEN, GFP_KERNEL); if (!macbuf) goto error; if (usb_set_interface(usbdev, intf->altsetting->desc.bInterfaceNumber, 1)) { dev_err(dev, "Can't set altsetting 1.\n"); ret = -EIO; goto fail_mem; } netdev = alloc_etherdev(sizeof(struct catc)); if (!netdev) goto fail_mem; catc = netdev_priv(netdev); netdev->netdev_ops = &catc_netdev_ops; netdev->watchdog_timeo = TX_TIMEOUT; netdev->ethtool_ops = &ops; catc->usbdev = usbdev; catc->netdev = netdev; spin_lock_init(&catc->tx_lock); spin_lock_init(&catc->ctrl_lock); timer_setup(&catc->timer, catc_stats_timer, 0); catc->ctrl_urb = usb_alloc_urb(0, GFP_KERNEL); catc->tx_urb = usb_alloc_urb(0, GFP_KERNEL); catc->rx_urb = usb_alloc_urb(0, GFP_KERNEL); catc->irq_urb = usb_alloc_urb(0, GFP_KERNEL); if ((!catc->ctrl_urb) || (!catc->tx_urb) || (!catc->rx_urb) || (!catc->irq_urb)) { dev_err(&intf->dev, "No free urbs available.\n"); ret = -ENOMEM; goto fail_free; } /* The F5U011 has the same vendor/product as the netmate but a device version of 0x130 */ if (le16_to_cpu(usbdev->descriptor.idVendor) == 0x0423 && le16_to_cpu(usbdev->descriptor.idProduct) == 0xa && le16_to_cpu(catc->usbdev->descriptor.bcdDevice) == 0x0130) { dev_dbg(dev, "Testing for f5u011\n"); catc->is_f5u011 = 1; atomic_set(&catc->recq_sz, 0); pktsz = RX_PKT_SZ; } else { pktsz = RX_MAX_BURST * (PKT_SZ + 2); } usb_fill_control_urb(catc->ctrl_urb, usbdev, usb_sndctrlpipe(usbdev, 0), NULL, NULL, 0, catc_ctrl_done, catc); usb_fill_bulk_urb(catc->tx_urb, usbdev, usb_sndbulkpipe(usbdev, 1), NULL, 0, catc_tx_done, catc); usb_fill_bulk_urb(catc->rx_urb, usbdev, usb_rcvbulkpipe(usbdev, 1), catc->rx_buf, pktsz, catc_rx_done, catc); usb_fill_int_urb(catc->irq_urb, usbdev, usb_rcvintpipe(usbdev, 2), catc->irq_buf, 2, catc_irq_done, catc, 1); if (!catc->is_f5u011) { u32 *buf; int i; dev_dbg(dev, "Checking memory size\n"); buf = kmalloc(4, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto fail_free; } *buf = 0x12345678; catc_write_mem(catc, 0x7a80, buf, 4); *buf = 0x87654321; catc_write_mem(catc, 0xfa80, buf, 4); catc_read_mem(catc, 0x7a80, buf, 4); switch (*buf) { case 0x12345678: catc_set_reg(catc, TxBufCount, 8); catc_set_reg(catc, RxBufCount, 32); dev_dbg(dev, "64k Memory\n"); break; default: dev_warn(&intf->dev, "Couldn't detect memory size, assuming 32k\n"); fallthrough; case 0x87654321: catc_set_reg(catc, TxBufCount, 4); catc_set_reg(catc, RxBufCount, 16); dev_dbg(dev, "32k Memory\n"); break; } kfree(buf); dev_dbg(dev, "Getting MAC from SEEROM.\n"); catc_get_mac(catc, macbuf); eth_hw_addr_set(netdev, macbuf); dev_dbg(dev, "Setting MAC into registers.\n"); for (i = 0; i < 6; i++) catc_set_reg(catc, StationAddr0 - i, netdev->dev_addr[i]); dev_dbg(dev, "Filling the multicast list.\n"); eth_broadcast_addr(broadcast); catc_multicast(broadcast, catc->multicast); catc_multicast(netdev->dev_addr, catc->multicast); catc_write_mem(catc, 0xfa80, catc->multicast, 64); dev_dbg(dev, "Clearing error counters.\n"); for (i = 0; i < 8; i++) catc_set_reg(catc, EthStats + i, 0); catc->last_stats = jiffies; dev_dbg(dev, "Enabling.\n"); catc_set_reg(catc, MaxBurst, RX_MAX_BURST); catc_set_reg(catc, OpModes, OpTxMerge | OpRxMerge | OpLenInclude | Op3MemWaits); catc_set_reg(catc, LEDCtrl, LEDLink); catc_set_reg(catc, RxUnit, RxEnable | RxPolarity | RxMultiCast); } else { dev_dbg(dev, "Performing reset\n"); catc_reset(catc); catc_get_mac(catc, macbuf); eth_hw_addr_set(netdev, macbuf); dev_dbg(dev, "Setting RX Mode\n"); catc->rxmode[0] = RxEnable | RxPolarity | RxMultiCast; catc->rxmode[1] = 0; f5u011_rxmode(catc, catc->rxmode); } dev_dbg(dev, "Init done.\n"); printk(KERN_INFO "%s: %s USB Ethernet at usb-%s-%s, %pM.\n", netdev->name, (catc->is_f5u011) ? "Belkin F5U011" : "CATC EL1210A NetMate", usbdev->bus->bus_name, usbdev->devpath, netdev->dev_addr); usb_set_intfdata(intf, catc); SET_NETDEV_DEV(netdev, &intf->dev); ret = register_netdev(netdev); if (ret) goto fail_clear_intfdata; kfree(macbuf); return 0; fail_clear_intfdata: usb_set_intfdata(intf, NULL); fail_free: usb_free_urb(catc->ctrl_urb); usb_free_urb(catc->tx_urb); usb_free_urb(catc->rx_urb); usb_free_urb(catc->irq_urb); free_netdev(netdev); fail_mem: kfree(macbuf); error: return ret; } static void catc_disconnect(struct usb_interface *intf) { struct catc *catc = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); if (catc) { unregister_netdev(catc->netdev); usb_free_urb(catc->ctrl_urb); usb_free_urb(catc->tx_urb); usb_free_urb(catc->rx_urb); usb_free_urb(catc->irq_urb); free_netdev(catc->netdev); } } /* * Module functions and tables. */ static const struct usb_device_id catc_id_table[] = { { USB_DEVICE(0x0423, 0xa) }, /* CATC Netmate, Belkin F5U011 */ { USB_DEVICE(0x0423, 0xc) }, /* CATC Netmate II, Belkin F5U111 */ { USB_DEVICE(0x08d1, 0x1) }, /* smartBridges smartNIC */ { } }; MODULE_DEVICE_TABLE(usb, catc_id_table); static struct usb_driver catc_driver = { .name = driver_name, .probe = catc_probe, .disconnect = catc_disconnect, .id_table = catc_id_table, .disable_hub_initiated_lpm = 1, }; module_usb_driver(catc_driver);
7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 /* SPDX-License-Identifier: GPL-2.0-only */ /* * File: pep.h * * Phonet Pipe End Point sockets definitions * * Copyright (C) 2008 Nokia Corporation. */ #ifndef NET_PHONET_PEP_H #define NET_PHONET_PEP_H #include <linux/skbuff.h> #include <net/phonet/phonet.h> struct pep_sock { struct pn_sock pn_sk; /* XXX: union-ify listening vs connected stuff ? */ /* Listening socket stuff: */ struct hlist_head hlist; /* Connected socket stuff: */ struct sock *listener; struct sk_buff_head ctrlreq_queue; #define PNPIPE_CTRLREQ_MAX 10 atomic_t tx_credits; int ifindex; u16 peer_type; /* peer type/subtype */ u8 pipe_handle; u8 rx_credits; u8 rx_fc; /* RX flow control */ u8 tx_fc; /* TX flow control */ u8 init_enable; /* auto-enable at creation */ u8 aligned; }; static inline struct pep_sock *pep_sk(struct sock *sk) { return (struct pep_sock *)sk; } extern const struct proto_ops phonet_stream_ops; /* Pipe protocol definitions */ struct pnpipehdr { u8 utid; /* transaction ID */ u8 message_id; u8 pipe_handle; union { u8 state_after_connect; /* connect request */ u8 state_after_reset; /* reset request */ u8 error_code; /* any response */ u8 pep_type; /* status indication */ u8 data0; /* anything else */ }; u8 data[]; }; #define other_pep_type data[0] static inline struct pnpipehdr *pnp_hdr(struct sk_buff *skb) { return (struct pnpipehdr *)skb_transport_header(skb); } #define MAX_PNPIPE_HEADER (MAX_PHONET_HEADER + 4) enum { PNS_PIPE_CREATE_REQ = 0x00, PNS_PIPE_CREATE_RESP, PNS_PIPE_REMOVE_REQ, PNS_PIPE_REMOVE_RESP, PNS_PIPE_DATA = 0x20, PNS_PIPE_ALIGNED_DATA, PNS_PEP_CONNECT_REQ = 0x40, PNS_PEP_CONNECT_RESP, PNS_PEP_DISCONNECT_REQ, PNS_PEP_DISCONNECT_RESP, PNS_PEP_RESET_REQ, PNS_PEP_RESET_RESP, PNS_PEP_ENABLE_REQ, PNS_PEP_ENABLE_RESP, PNS_PEP_CTRL_REQ, PNS_PEP_CTRL_RESP, PNS_PEP_DISABLE_REQ = 0x4C, PNS_PEP_DISABLE_RESP, PNS_PEP_STATUS_IND = 0x60, PNS_PIPE_CREATED_IND, PNS_PIPE_RESET_IND = 0x63, PNS_PIPE_ENABLED_IND, PNS_PIPE_REDIRECTED_IND, PNS_PIPE_DISABLED_IND = 0x66, }; #define PN_PIPE_INVALID_HANDLE 0xff #define PN_PEP_TYPE_COMMON 0x00 /* Phonet pipe status indication */ enum { PN_PEP_IND_FLOW_CONTROL, PN_PEP_IND_ID_MCFC_GRANT_CREDITS, }; /* Phonet pipe error codes */ enum { PN_PIPE_NO_ERROR, PN_PIPE_ERR_INVALID_PARAM, PN_PIPE_ERR_INVALID_HANDLE, PN_PIPE_ERR_INVALID_CTRL_ID, PN_PIPE_ERR_NOT_ALLOWED, PN_PIPE_ERR_PEP_IN_USE, PN_PIPE_ERR_OVERLOAD, PN_PIPE_ERR_DEV_DISCONNECTED, PN_PIPE_ERR_TIMEOUT, PN_PIPE_ERR_ALL_PIPES_IN_USE, PN_PIPE_ERR_GENERAL, PN_PIPE_ERR_NOT_SUPPORTED, }; /* Phonet pipe states */ enum { PN_PIPE_DISABLE, PN_PIPE_ENABLE, }; /* Phonet pipe sub-block types */ enum { PN_PIPE_SB_CREATE_REQ_PEP_SUB_TYPE, PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE, PN_PIPE_SB_REDIRECT_REQ_PEP_SUB_TYPE, PN_PIPE_SB_NEGOTIATED_FC, PN_PIPE_SB_REQUIRED_FC_TX, PN_PIPE_SB_PREFERRED_FC_RX, PN_PIPE_SB_ALIGNED_DATA, }; /* Phonet pipe flow control models */ enum { PN_NO_FLOW_CONTROL, PN_LEGACY_FLOW_CONTROL, PN_ONE_CREDIT_FLOW_CONTROL, PN_MULTI_CREDIT_FLOW_CONTROL, PN_MAX_FLOW_CONTROL, }; #define pn_flow_safe(fc) ((fc) >> 1) /* Phonet pipe flow control states */ enum { PEP_IND_EMPTY, PEP_IND_BUSY, PEP_IND_READY, }; #endif
26 12 24 24 12 24 7 29 10 8 13 12 7 5 7 7 4 7 12 22 22 2 2 20 9 3 3 22 22 19 19 18 19 18 14 14 14 19 248 248 5 5 5 5 10 5 4 1 4 1 11 35 15 14 35 34 34 34 34 34 34 34 35 41 40 40 9 5 9 8 8 4 3 35 34 32 31 4 15 29 20 11 12 40 22 22 24 4 4 1 4 3 3 3 2 1 1 2 6 6 2 1 4 4 1 2 4 2 1 2 4 4 4 4 2 1 1 4 2 2 1 44 44 2 2 43 42 41 41 12 29 27 27 27 6 5 1 4 26 23 11 10 9 9 7 7 1 7 6 7 7 5 23 21 22 22 6 22 7 53 53 51 52 6 4 44 6 5 6 3 6 3 3 1 3 3 1 3 3 1 3 6 6 6 6 6 3 6 6 6 6 3 3 3 6 79 79 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip6_flowlabel.c IPv6 flowlabel manager. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/pid_namespace.h> #include <linux/jump_label_ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/rawv6.h> #include <net/transp_v6.h> #include <linux/uaccess.h> #define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified in old IPv6 RFC. Well, it was reasonable value. */ #define FL_MAX_LINGER 150 /* Maximal linger timeout */ /* FL hash table */ #define FL_MAX_PER_SOCK 32 #define FL_MAX_SIZE 4096 #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) static atomic_t fl_size = ATOMIC_INIT(0); static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(struct timer_list *unused); static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc); /* FL hash table lock: it protects only of GC */ static DEFINE_SPINLOCK(ip6_fl_lock); /* Big socket sock */ static DEFINE_SPINLOCK(ip6_sk_fl_lock); DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ); EXPORT_SYMBOL(ipv6_flowlabel_exclusive); #define for_each_fl_rcu(hash, fl) \ for (fl = rcu_dereference(fl_ht[(hash)]); \ fl != NULL; \ fl = rcu_dereference(fl->next)) #define for_each_fl_continue_rcu(fl) \ for (fl = rcu_dereference(fl->next); \ fl != NULL; \ fl = rcu_dereference(fl->next)) #define for_each_sk_fl_rcu(np, sfl) \ for (sfl = rcu_dereference(np->ipv6_fl_list); \ sfl != NULL; \ sfl = rcu_dereference(sfl->next)) static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; for_each_fl_rcu(FL_HASH(label), fl) { if (fl->label == label && net_eq(fl->fl_net, net)) return fl; } return NULL; } static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; rcu_read_lock(); fl = __fl_lookup(net, label); if (fl && !atomic_inc_not_zero(&fl->users)) fl = NULL; rcu_read_unlock(); return fl; } static bool fl_shared_exclusive(struct ip6_flowlabel *fl) { return fl->share == IPV6_FL_S_EXCL || fl->share == IPV6_FL_S_PROCESS || fl->share == IPV6_FL_S_USER; } static void fl_free_rcu(struct rcu_head *head) { struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); if (fl->share == IPV6_FL_S_PROCESS) put_pid(fl->owner.pid); kfree(fl->opt); kfree(fl); } static void fl_free(struct ip6_flowlabel *fl) { if (!fl) return; if (fl_shared_exclusive(fl) || fl->opt) static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive); call_rcu(&fl->rcu, fl_free_rcu); } static void fl_release(struct ip6_flowlabel *fl) { spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (atomic_dec_and_test(&fl->users)) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) fl->expires = ttd; ttd = fl->expires; if (fl->opt && fl->share == IPV6_FL_S_EXCL) { struct ipv6_txoptions *opt = fl->opt; fl->opt = NULL; kfree(opt); } if (!timer_pending(&ip6_fl_gc_timer) || time_after(ip6_fl_gc_timer.expires, ttd)) mod_timer(&ip6_fl_gc_timer, ttd); } spin_unlock_bh(&ip6_fl_lock); } static void ip6_fl_gc(struct timer_list *unused) { int i; unsigned long now = jiffies; unsigned long sched = 0; spin_lock(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; flp = &fl_ht[i]; while ((fl = rcu_dereference_protected(*flp, lockdep_is_held(&ip6_fl_lock))) != NULL) { if (atomic_read(&fl->users) == 0) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) fl->expires = ttd; ttd = fl->expires; if (time_after_eq(now, ttd)) { *flp = fl->next; fl_free(fl); atomic_dec(&fl_size); continue; } if (!sched || time_before(ttd, sched)) sched = ttd; } flp = &fl->next; } } if (!sched && atomic_read(&fl_size)) sched = now + FL_MAX_LINGER; if (sched) { mod_timer(&ip6_fl_gc_timer, sched); } spin_unlock(&ip6_fl_lock); } static void __net_exit ip6_fl_purge(struct net *net) { int i; spin_lock_bh(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; flp = &fl_ht[i]; while ((fl = rcu_dereference_protected(*flp, lockdep_is_held(&ip6_fl_lock))) != NULL) { if (net_eq(fl->fl_net, net) && atomic_read(&fl->users) == 0) { *flp = fl->next; fl_free(fl); atomic_dec(&fl_size); continue; } flp = &fl->next; } } spin_unlock_bh(&ip6_fl_lock); } static struct ip6_flowlabel *fl_intern(struct net *net, struct ip6_flowlabel *fl, __be32 label) { struct ip6_flowlabel *lfl; fl->label = label & IPV6_FLOWLABEL_MASK; rcu_read_lock(); spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK; if (fl->label) { lfl = __fl_lookup(net, fl->label); if (!lfl) break; } } } else { /* * we dropper the ip6_fl_lock, so this entry could reappear * and we need to recheck with it. * * OTOH no need to search the active socket first, like it is * done in ipv6_flowlabel_opt - sock is locked, so new entry * with the same label can only appear on another sock */ lfl = __fl_lookup(net, fl->label); if (lfl) { atomic_inc(&lfl->users); spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return lfl; } } fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); atomic_inc(&fl_size); spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return NULL; } /* Socket flowlabel lists */ struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; struct ipv6_pinfo *np = inet6_sk(sk); label &= IPV6_FLOWLABEL_MASK; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { struct ip6_flowlabel *fl = sfl->fl; if (fl->label == label && atomic_inc_not_zero(&fl->users)) { fl->lastuse = jiffies; rcu_read_unlock(); return fl; } } rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(__fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; if (!rcu_access_pointer(np->ipv6_fl_list)) return; spin_lock_bh(&ip6_sk_fl_lock); while ((sfl = rcu_dereference_protected(np->ipv6_fl_list, lockdep_is_held(&ip6_sk_fl_lock))) != NULL) { np->ipv6_fl_list = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); spin_lock_bh(&ip6_sk_fl_lock); } spin_unlock_bh(&ip6_sk_fl_lock); } /* Service routines */ /* It is the only difficult place. flowlabel enforces equal headers before and including routing header, however user may supply options following rthdr. */ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, struct ipv6_txoptions *fopt) { struct ipv6_txoptions *fl_opt = fl->opt; if (!fopt || fopt->opt_flen == 0) return fl_opt; if (fl_opt) { opt_space->hopopt = fl_opt->hopopt; opt_space->dst0opt = fl_opt->dst0opt; opt_space->srcrt = fl_opt->srcrt; opt_space->opt_nflen = fl_opt->opt_nflen; } else { if (fopt->opt_nflen == 0) return fopt; opt_space->hopopt = NULL; opt_space->dst0opt = NULL; opt_space->srcrt = NULL; opt_space->opt_nflen = 0; } opt_space->dst1opt = fopt->dst1opt; opt_space->opt_flen = fopt->opt_flen; opt_space->tot_len = fopt->tot_len; return opt_space; } EXPORT_SYMBOL_GPL(fl6_merge_options); static unsigned long check_linger(unsigned long ttl) { if (ttl < FL_MIN_LINGER) return FL_MIN_LINGER*HZ; if (ttl > FL_MAX_LINGER && !capable(CAP_NET_ADMIN)) return 0; return ttl*HZ; } static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires) { linger = check_linger(linger); if (!linger) return -EPERM; expires = check_linger(expires); if (!expires) return -EPERM; spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (time_before(fl->linger, linger)) fl->linger = linger; if (time_before(expires, fl->linger)) expires = fl->linger; if (time_before(fl->expires, fl->lastuse + expires)) fl->expires = fl->lastuse + expires; spin_unlock_bh(&ip6_fl_lock); return 0; } static struct ip6_flowlabel * fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, sockptr_t optval, int optlen, int *err_p) { struct ip6_flowlabel *fl = NULL; int olen; int addr_type; int err; olen = optlen - CMSG_ALIGN(sizeof(*freq)); err = -EINVAL; if (olen > 64 * 1024) goto done; err = -ENOMEM; fl = kzalloc(sizeof(*fl), GFP_KERNEL); if (!fl) goto done; if (olen > 0) { struct msghdr msg; struct flowi6 flowi6; struct ipcm6_cookie ipc6; err = -ENOMEM; fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL); if (!fl->opt) goto done; memset(fl->opt, 0, sizeof(*fl->opt)); fl->opt->tot_len = sizeof(*fl->opt) + olen; err = -EFAULT; if (copy_from_sockptr_offset(fl->opt + 1, optval, CMSG_ALIGN(sizeof(*freq)), olen)) goto done; msg.msg_controllen = olen; msg.msg_control = (void *)(fl->opt+1); memset(&flowi6, 0, sizeof(flowi6)); ipc6.opt = fl->opt; err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6); if (err) goto done; err = -EINVAL; if (fl->opt->opt_flen) goto done; if (fl->opt->opt_nflen == 0) { kfree(fl->opt); fl->opt = NULL; } } fl->fl_net = net; fl->expires = jiffies; err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); if (err) goto done; fl->share = freq->flr_share; addr_type = ipv6_addr_type(&freq->flr_dst); if ((addr_type & IPV6_ADDR_MAPPED) || addr_type == IPV6_ADDR_ANY) { err = -EINVAL; goto done; } fl->dst = freq->flr_dst; atomic_set(&fl->users, 1); switch (fl->share) { case IPV6_FL_S_EXCL: case IPV6_FL_S_ANY: break; case IPV6_FL_S_PROCESS: fl->owner.pid = get_task_pid(current, PIDTYPE_PID); break; case IPV6_FL_S_USER: fl->owner.uid = current_euid(); break; default: err = -EINVAL; goto done; } if (fl_shared_exclusive(fl) || fl->opt) { WRITE_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl, 1); static_branch_deferred_inc(&ipv6_flowlabel_exclusive); } return fl; done: if (fl) { kfree(fl->opt); kfree(fl); } *err_p = err; return NULL; } static int mem_check(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; int room = FL_MAX_SIZE - atomic_read(&fl_size); int count = 0; if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) count++; rcu_read_unlock(); if (room <= 0 || ((count >= FL_MAX_PER_SOCK || (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && !capable(CAP_NET_ADMIN))) return -ENOBUFS; return 0; } static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, struct ip6_flowlabel *fl) { spin_lock_bh(&ip6_sk_fl_lock); sfl->fl = fl; sfl->next = np->ipv6_fl_list; rcu_assign_pointer(np->ipv6_fl_list, sfl); spin_unlock_bh(&ip6_sk_fl_lock); } int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; if (flags & IPV6_FL_F_REMOTE) { freq->flr_label = np->rcv_flowinfo & IPV6_FLOWLABEL_MASK; return 0; } if (inet6_test_bit(REPFLOW, sk)) { freq->flr_label = np->flow_label; return 0; } rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) { spin_lock_bh(&ip6_fl_lock); freq->flr_label = sfl->fl->label; freq->flr_dst = sfl->fl->dst; freq->flr_share = sfl->fl->share; freq->flr_expires = (sfl->fl->expires - jiffies) / HZ; freq->flr_linger = sfl->fl->linger / HZ; spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return 0; } } rcu_read_unlock(); return -ENOENT; } #define socklist_dereference(__sflp) \ rcu_dereference_protected(__sflp, lockdep_is_held(&ip6_sk_fl_lock)) static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist __rcu **sflp; struct ipv6_fl_socklist *sfl; if (freq->flr_flags & IPV6_FL_F_REFLECT) { if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; if (!inet6_test_bit(REPFLOW, sk)) return -ESRCH; np->flow_label = 0; inet6_clear_bit(REPFLOW, sk); return 0; } spin_lock_bh(&ip6_sk_fl_lock); for (sflp = &np->ipv6_fl_list; (sfl = socklist_dereference(*sflp)) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq->flr_label) goto found; } spin_unlock_bh(&ip6_sk_fl_lock); return -ESRCH; found: if (freq->flr_label == (np->flow_label & IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; *sflp = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); return 0; } static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct ipv6_fl_socklist *sfl; int err; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq->flr_label) { err = fl6_renew(sfl->fl, freq->flr_linger, freq->flr_expires); rcu_read_unlock(); return err; } } rcu_read_unlock(); if (freq->flr_share == IPV6_FL_S_NONE && ns_capable(net->user_ns, CAP_NET_ADMIN)) { struct ip6_flowlabel *fl = fl_lookup(net, freq->flr_label); if (fl) { err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); fl_release(fl); return err; } } return -ESRCH; } static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, sockptr_t optval, int optlen) { struct ipv6_fl_socklist *sfl, *sfl1 = NULL; struct ip6_flowlabel *fl, *fl1 = NULL; struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int err; if (freq->flr_flags & IPV6_FL_F_REFLECT) { if (net->ipv6.sysctl.flowlabel_consistency) { net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n"); return -EPERM; } if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; inet6_set_bit(REPFLOW, sk); return 0; } if (freq->flr_label & ~IPV6_FLOWLABEL_MASK) return -EINVAL; if (net->ipv6.sysctl.flowlabel_state_ranges && (freq->flr_label & IPV6_FLOWLABEL_STATELESS_FLAG)) return -ERANGE; fl = fl_create(net, sk, freq, optval, optlen, &err); if (!fl) return err; sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); if (freq->flr_label) { err = -EEXIST; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq->flr_label) { if (freq->flr_flags & IPV6_FL_F_EXCL) { rcu_read_unlock(); goto done; } fl1 = sfl->fl; if (!atomic_inc_not_zero(&fl1->users)) fl1 = NULL; break; } } rcu_read_unlock(); if (!fl1) fl1 = fl_lookup(net, freq->flr_label); if (fl1) { recheck: err = -EEXIST; if (freq->flr_flags&IPV6_FL_F_EXCL) goto release; err = -EPERM; if (fl1->share == IPV6_FL_S_EXCL || fl1->share != fl->share || ((fl1->share == IPV6_FL_S_PROCESS) && (fl1->owner.pid != fl->owner.pid)) || ((fl1->share == IPV6_FL_S_USER) && !uid_eq(fl1->owner.uid, fl->owner.uid))) goto release; err = -ENOMEM; if (!sfl1) goto release; if (fl->linger > fl1->linger) fl1->linger = fl->linger; if ((long)(fl->expires - fl1->expires) > 0) fl1->expires = fl->expires; fl_link(np, sfl1, fl1); fl_free(fl); return 0; release: fl_release(fl1); goto done; } } err = -ENOENT; if (!(freq->flr_flags & IPV6_FL_F_CREATE)) goto done; err = -ENOMEM; if (!sfl1) goto done; err = mem_check(sk); if (err != 0) goto done; fl1 = fl_intern(net, fl, freq->flr_label); if (fl1) goto recheck; if (!freq->flr_label) { size_t offset = offsetof(struct in6_flowlabel_req, flr_label); if (copy_to_sockptr_offset(optval, offset, &fl->label, sizeof(fl->label))) { /* Intentionally ignore fault. */ } } fl_link(np, sfl1, fl); return 0; done: fl_free(fl); kfree(sfl1); return err; } int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen) { struct in6_flowlabel_req freq; if (optlen < sizeof(freq)) return -EINVAL; if (copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; switch (freq.flr_action) { case IPV6_FL_A_PUT: return ipv6_flowlabel_put(sk, &freq); case IPV6_FL_A_RENEW: return ipv6_flowlabel_renew(sk, &freq); case IPV6_FL_A_GET: return ipv6_flowlabel_get(sk, &freq, optval, optlen); default: return -EINVAL; } } #ifdef CONFIG_PROC_FS struct ip6fl_iter_state { struct seq_net_private p; struct pid_namespace *pid_ns; int bucket; }; #define ip6fl_seq_private(seq) ((struct ip6fl_iter_state *)(seq)->private) static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq) { struct ip6_flowlabel *fl = NULL; struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) { for_each_fl_rcu(state->bucket, fl) { if (net_eq(fl->fl_net, net)) goto out; } } fl = NULL; out: return fl; } static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); for_each_fl_continue_rcu(fl) { if (net_eq(fl->fl_net, net)) goto out; } try_again: if (++state->bucket <= FL_HASH_MASK) { for_each_fl_rcu(state->bucket, fl) { if (net_eq(fl->fl_net, net)) goto out; } goto try_again; } fl = NULL; out: return fl; } static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) { struct ip6_flowlabel *fl = ip6fl_get_first(seq); if (fl) while (pos && (fl = ip6fl_get_next(seq, fl)) != NULL) --pos; return pos ? NULL : fl; } static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb); rcu_read_lock(); return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip6_flowlabel *fl; if (v == SEQ_START_TOKEN) fl = ip6fl_get_first(seq); else fl = ip6fl_get_next(seq, v); ++*pos; return fl; } static void ip6fl_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ip6fl_seq_show(struct seq_file *seq, void *v) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Label S Owner Users Linger Expires Dst Opt\n"); } else { struct ip6_flowlabel *fl = v; seq_printf(seq, "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n", (unsigned int)ntohl(fl->label), fl->share, ((fl->share == IPV6_FL_S_PROCESS) ? pid_nr_ns(fl->owner.pid, state->pid_ns) : ((fl->share == IPV6_FL_S_USER) ? from_kuid_munged(seq_user_ns(seq), fl->owner.uid) : 0)), atomic_read(&fl->users), fl->linger/HZ, (long)(fl->expires - jiffies)/HZ, &fl->dst, fl->opt ? fl->opt->opt_nflen : 0); } return 0; } static const struct seq_operations ip6fl_seq_ops = { .start = ip6fl_seq_start, .next = ip6fl_seq_next, .stop = ip6fl_seq_stop, .show = ip6fl_seq_show, }; static int __net_init ip6_flowlabel_proc_init(struct net *net) { if (!proc_create_net("ip6_flowlabel", 0444, net->proc_net, &ip6fl_seq_ops, sizeof(struct ip6fl_iter_state))) return -ENOMEM; return 0; } static void __net_exit ip6_flowlabel_proc_fini(struct net *net) { remove_proc_entry("ip6_flowlabel", net->proc_net); } #else static inline int ip6_flowlabel_proc_init(struct net *net) { return 0; } static inline void ip6_flowlabel_proc_fini(struct net *net) { } #endif static void __net_exit ip6_flowlabel_net_exit(struct net *net) { ip6_fl_purge(net); ip6_flowlabel_proc_fini(net); } static struct pernet_operations ip6_flowlabel_net_ops = { .init = ip6_flowlabel_proc_init, .exit = ip6_flowlabel_net_exit, }; int ip6_flowlabel_init(void) { return register_pernet_subsys(&ip6_flowlabel_net_ops); } void ip6_flowlabel_cleanup(void) { static_key_deferred_flush(&ipv6_flowlabel_exclusive); timer_delete(&ip6_fl_gc_timer); unregister_pernet_subsys(&ip6_flowlabel_net_ops); }
100 101 99 101 130 39 99 99 99 10 5 1 1 1 97 97 96 97 69 97 70 66 49 48 48 48 48 48 2 97 94 97 72 9 74 4 4 33 30 29 32 33 33 30 2 32 33 33 33 13 10 13 3 10 13 13 13 19 18 14 18 19 12 12 6 6 2 2 40 5 35 3 1 3 39 39 39 39 39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 // SPDX-License-Identifier: GPL-2.0-or-later /* * algif_hash: User-space interface for hash algorithms * * This file provides the user-space API for hash algorithms. * * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/hash.h> #include <crypto/if_alg.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/net.h> #include <net/sock.h> struct hash_ctx { struct af_alg_sgl sgl; u8 *result; struct crypto_wait wait; unsigned int len; bool more; struct ahash_request req; }; static int hash_alloc_result(struct sock *sk, struct hash_ctx *ctx) { unsigned ds; if (ctx->result) return 0; ds = crypto_ahash_digestsize(crypto_ahash_reqtfm(&ctx->req)); ctx->result = sock_kmalloc(sk, ds, GFP_KERNEL); if (!ctx->result) return -ENOMEM; memset(ctx->result, 0, ds); return 0; } static void hash_free_result(struct sock *sk, struct hash_ctx *ctx) { unsigned ds; if (!ctx->result) return; ds = crypto_ahash_digestsize(crypto_ahash_reqtfm(&ctx->req)); sock_kzfree_s(sk, ctx->result, ds); ctx->result = NULL; } static int hash_sendmsg(struct socket *sock, struct msghdr *msg, size_t ignored) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct hash_ctx *ctx = ask->private; ssize_t copied = 0; size_t len, max_pages, npages; bool continuing, need_init = false; int err; max_pages = min_t(size_t, ALG_MAX_PAGES, DIV_ROUND_UP(sk->sk_sndbuf, PAGE_SIZE)); lock_sock(sk); continuing = ctx->more; if (!continuing) { /* Discard a previous request that wasn't marked MSG_MORE. */ hash_free_result(sk, ctx); if (!msg_data_left(msg)) goto done; /* Zero-length; don't start new req */ need_init = true; } else if (!msg_data_left(msg)) { /* * No data - finalise the prev req if MSG_MORE so any error * comes out here. */ if (!(msg->msg_flags & MSG_MORE)) { err = hash_alloc_result(sk, ctx); if (err) goto unlock_free_result; ahash_request_set_crypt(&ctx->req, NULL, ctx->result, 0); err = crypto_wait_req(crypto_ahash_final(&ctx->req), &ctx->wait); if (err) goto unlock_free_result; } goto done_more; } while (msg_data_left(msg)) { ctx->sgl.sgt.sgl = ctx->sgl.sgl; ctx->sgl.sgt.nents = 0; ctx->sgl.sgt.orig_nents = 0; err = -EIO; npages = iov_iter_npages(&msg->msg_iter, max_pages); if (npages == 0) goto unlock_free; sg_init_table(ctx->sgl.sgl, npages); ctx->sgl.need_unpin = iov_iter_extract_will_pin(&msg->msg_iter); err = extract_iter_to_sg(&msg->msg_iter, LONG_MAX, &ctx->sgl.sgt, npages, 0); if (err < 0) goto unlock_free; len = err; sg_mark_end(ctx->sgl.sgt.sgl + ctx->sgl.sgt.nents - 1); if (!msg_data_left(msg)) { err = hash_alloc_result(sk, ctx); if (err) goto unlock_free; } ahash_request_set_crypt(&ctx->req, ctx->sgl.sgt.sgl, ctx->result, len); if (!msg_data_left(msg) && !continuing && !(msg->msg_flags & MSG_MORE)) { err = crypto_ahash_digest(&ctx->req); } else { if (need_init) { err = crypto_wait_req( crypto_ahash_init(&ctx->req), &ctx->wait); if (err) goto unlock_free; need_init = false; } if (msg_data_left(msg) || (msg->msg_flags & MSG_MORE)) err = crypto_ahash_update(&ctx->req); else err = crypto_ahash_finup(&ctx->req); continuing = true; } err = crypto_wait_req(err, &ctx->wait); if (err) goto unlock_free; copied += len; af_alg_free_sg(&ctx->sgl); } done_more: ctx->more = msg->msg_flags & MSG_MORE; done: err = 0; unlock: release_sock(sk); return copied ?: err; unlock_free: af_alg_free_sg(&ctx->sgl); unlock_free_result: hash_free_result(sk, ctx); ctx->more = false; goto unlock; } static int hash_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct hash_ctx *ctx = ask->private; unsigned ds = crypto_ahash_digestsize(crypto_ahash_reqtfm(&ctx->req)); bool result; int err; if (len > ds) len = ds; else if (len < ds) msg->msg_flags |= MSG_TRUNC; lock_sock(sk); result = ctx->result; err = hash_alloc_result(sk, ctx); if (err) goto unlock; ahash_request_set_crypt(&ctx->req, NULL, ctx->result, 0); if (!result && !ctx->more) { err = crypto_wait_req(crypto_ahash_init(&ctx->req), &ctx->wait); if (err) goto unlock; } if (!result || ctx->more) { ctx->more = false; err = crypto_wait_req(crypto_ahash_final(&ctx->req), &ctx->wait); if (err) goto unlock; } err = memcpy_to_msg(msg, ctx->result, len); unlock: hash_free_result(sk, ctx); release_sock(sk); return err ?: len; } static int hash_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct hash_ctx *ctx = ask->private; struct ahash_request *req = &ctx->req; struct crypto_ahash *tfm; struct sock *sk2; struct alg_sock *ask2; struct hash_ctx *ctx2; char *state; bool more; int err; tfm = crypto_ahash_reqtfm(req); state = kmalloc(crypto_ahash_statesize(tfm), GFP_KERNEL); err = -ENOMEM; if (!state) goto out; lock_sock(sk); more = ctx->more; err = more ? crypto_ahash_export(req, state) : 0; release_sock(sk); if (err) goto out_free_state; err = af_alg_accept(ask->parent, newsock, arg); if (err) goto out_free_state; sk2 = newsock->sk; ask2 = alg_sk(sk2); ctx2 = ask2->private; ctx2->more = more; if (!more) goto out_free_state; err = crypto_ahash_import(&ctx2->req, state); out_free_state: kfree_sensitive(state); out: return err; } static struct proto_ops algif_hash_ops = { .family = PF_ALG, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .getname = sock_no_getname, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, .bind = sock_no_bind, .release = af_alg_release, .sendmsg = hash_sendmsg, .recvmsg = hash_recvmsg, .accept = hash_accept, }; static int hash_check_key(struct socket *sock) { int err = 0; struct sock *psk; struct alg_sock *pask; struct crypto_ahash *tfm; struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); lock_sock(sk); if (!atomic_read(&ask->nokey_refcnt)) goto unlock_child; psk = ask->parent; pask = alg_sk(ask->parent); tfm = pask->private; err = -ENOKEY; lock_sock_nested(psk, SINGLE_DEPTH_NESTING); if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) goto unlock; atomic_dec(&pask->nokey_refcnt); atomic_set(&ask->nokey_refcnt, 0); err = 0; unlock: release_sock(psk); unlock_child: release_sock(sk); return err; } static int hash_sendmsg_nokey(struct socket *sock, struct msghdr *msg, size_t size) { int err; err = hash_check_key(sock); if (err) return err; return hash_sendmsg(sock, msg, size); } static int hash_recvmsg_nokey(struct socket *sock, struct msghdr *msg, size_t ignored, int flags) { int err; err = hash_check_key(sock); if (err) return err; return hash_recvmsg(sock, msg, ignored, flags); } static int hash_accept_nokey(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { int err; err = hash_check_key(sock); if (err) return err; return hash_accept(sock, newsock, arg); } static struct proto_ops algif_hash_ops_nokey = { .family = PF_ALG, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .getname = sock_no_getname, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, .bind = sock_no_bind, .release = af_alg_release, .sendmsg = hash_sendmsg_nokey, .recvmsg = hash_recvmsg_nokey, .accept = hash_accept_nokey, }; static void *hash_bind(const char *name, u32 type, u32 mask) { return crypto_alloc_ahash(name, type, mask); } static void hash_release(void *private) { crypto_free_ahash(private); } static int hash_setkey(void *private, const u8 *key, unsigned int keylen) { return crypto_ahash_setkey(private, key, keylen); } static void hash_sock_destruct(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); struct hash_ctx *ctx = ask->private; hash_free_result(sk, ctx); sock_kfree_s(sk, ctx, ctx->len); af_alg_release_parent(sk); } static int hash_accept_parent_nokey(void *private, struct sock *sk) { struct crypto_ahash *tfm = private; struct alg_sock *ask = alg_sk(sk); struct hash_ctx *ctx; unsigned int len = sizeof(*ctx) + crypto_ahash_reqsize(tfm); ctx = sock_kmalloc(sk, len, GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->result = NULL; ctx->len = len; ctx->more = false; crypto_init_wait(&ctx->wait); ask->private = ctx; ahash_request_set_tfm(&ctx->req, tfm); ahash_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &ctx->wait); sk->sk_destruct = hash_sock_destruct; return 0; } static int hash_accept_parent(void *private, struct sock *sk) { struct crypto_ahash *tfm = private; if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return hash_accept_parent_nokey(private, sk); } static const struct af_alg_type algif_type_hash = { .bind = hash_bind, .release = hash_release, .setkey = hash_setkey, .accept = hash_accept_parent, .accept_nokey = hash_accept_parent_nokey, .ops = &algif_hash_ops, .ops_nokey = &algif_hash_ops_nokey, .name = "hash", .owner = THIS_MODULE }; static int __init algif_hash_init(void) { return af_alg_register_type(&algif_type_hash); } static void __exit algif_hash_exit(void) { int err = af_alg_unregister_type(&algif_type_hash); BUG_ON(err); } module_init(algif_hash_init); module_exit(algif_hash_exit); MODULE_DESCRIPTION("Userspace interface for hash algorithms"); MODULE_LICENSE("GPL");
56 58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #ifndef _BR_PRIVATE_STP_H #define _BR_PRIVATE_STP_H #define BPDU_TYPE_CONFIG 0 #define BPDU_TYPE_TCN 0x80 /* IEEE 802.1D-1998 timer values */ #define BR_MIN_HELLO_TIME (1*HZ) #define BR_MAX_HELLO_TIME (10*HZ) #define BR_MIN_FORWARD_DELAY (2*HZ) #define BR_MAX_FORWARD_DELAY (30*HZ) #define BR_MIN_MAX_AGE (6*HZ) #define BR_MAX_MAX_AGE (40*HZ) #define BR_MIN_PATH_COST 1 #define BR_MAX_PATH_COST 65535 struct br_config_bpdu { unsigned int topology_change:1; unsigned int topology_change_ack:1; bridge_id root; int root_path_cost; bridge_id bridge_id; port_id port_id; int message_age; int max_age; int hello_time; int forward_delay; }; /* called under bridge lock */ static inline int br_is_designated_port(const struct net_bridge_port *p) { return !memcmp(&p->designated_bridge, &p->br->bridge_id, 8) && (p->designated_port == p->port_id); } /* br_stp.c */ void br_become_root_bridge(struct net_bridge *br); void br_config_bpdu_generation(struct net_bridge *); void br_configuration_update(struct net_bridge *); void br_port_state_selection(struct net_bridge *); void br_received_config_bpdu(struct net_bridge_port *p, const struct br_config_bpdu *bpdu); void br_received_tcn_bpdu(struct net_bridge_port *p); void br_transmit_config(struct net_bridge_port *p); void br_transmit_tcn(struct net_bridge *br); void br_topology_change_detection(struct net_bridge *br); void __br_set_topology_change(struct net_bridge *br, unsigned char val); /* br_stp_bpdu.c */ void br_send_config_bpdu(struct net_bridge_port *, struct br_config_bpdu *); void br_send_tcn_bpdu(struct net_bridge_port *); #endif
9286 338 338 338 329 108 338 108 338 3359 3352 3536 3253 329 329 329 329 328 328 329 329 329 3255 303 302 3 304 304 304 304 165 12 12 12 12 408 407 6901 6916 6906 6903 3616 3616 5185 5187 9300 9319 7 408 402 408 8988 8990 9014 9014 2886 8703 8710 8714 8710 8709 1476 8711 2095 6999 7 2 2092 1971 3 1966 1952 1966 34 5 34 13 156 7161 5216 300 4992 2 4990 4959 67 6261 6659 167 167 167 165 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ #include <linux/sched.h> /* test_thread_flag(), ... */ #include <linux/sched/task_stack.h> /* task_stack_*(), ... */ #include <linux/kdebug.h> /* oops_begin/end, ... */ #include <linux/memblock.h> /* max_low_pfn */ #include <linux/kfence.h> /* kfence_handle_page_fault */ #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ #include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <linux/mm_types.h> #include <linux/mm.h> /* find_and_lock_vma() */ #include <linux/vmalloc.h> #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/fixmap.h> /* VSYSCALL_ADDR */ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ #include <asm/mmu_context.h> /* vma_pkey() */ #include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <asm/desc.h> /* store_idt(), ... */ #include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ #include <asm/kvm_para.h> /* kvm_handle_async_pf */ #include <asm/vdso.h> /* fixup_vdso_exception() */ #include <asm/irq_stack.h> #include <asm/fred.h> #include <asm/sev.h> /* snp_dump_hva_rmpentry() */ #define CREATE_TRACE_POINTS #include <trace/events/exceptions.h> /* * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: */ static nokprobe_inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; return 0; } /* * Prefetch quirks: * * 32-bit mode: * * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. * Check that here and ignore it. This is AMD erratum #91. * * 64-bit mode: * * Sometimes the CPU reports invalid exceptions on prefetch. * Check that here and ignore it. * * Opcode checker based on code by Richard Brunner. */ static inline int check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, unsigned char opcode, int *prefetch) { unsigned char instr_hi = opcode & 0xf0; unsigned char instr_lo = opcode & 0x0f; switch (instr_hi) { case 0x20: case 0x30: /* * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. * In X86_64 long mode, the CPU will signal invalid * opcode if some of these prefixes are present so * X86_64 will never get here anyway */ return ((instr_lo & 7) == 0x6); #ifdef CONFIG_X86_64 case 0x40: /* * In 64-bit mode 0x40..0x4F are valid REX prefixes */ return (!user_mode(regs) || user_64bit_mode(regs)); #endif case 0x60: /* 0x64 thru 0x67 are valid prefixes in all modes. */ return (instr_lo & 0xC) == 0x4; case 0xF0: /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ return !instr_lo || (instr_lo>>1) == 1; case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ if (get_kernel_nofault(opcode, instr)) return 0; *prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); return 0; default: return 0; } } static bool is_amd_k8_pre_npt(void) { struct cpuinfo_x86 *c = &boot_cpu_data; return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) && c->x86_vendor == X86_VENDOR_AMD && c->x86 == 0xf && c->x86_model < 0x40); } static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { unsigned char *max_instr; unsigned char *instr; int prefetch = 0; /* Erratum #91 affects AMD K8, pre-NPT CPUs */ if (!is_amd_k8_pre_npt()) return 0; /* * If it was a exec (instruction fetch) fault on NX page, then * do not ignore the fault: */ if (error_code & X86_PF_INSTR) return 0; instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; /* * This code has historically always bailed out if IP points to a * not-present page (e.g. due to a race). No one has ever * complained about this. */ pagefault_disable(); while (instr < max_instr) { unsigned char opcode; if (user_mode(regs)) { if (get_user(opcode, (unsigned char __user *) instr)) break; } else { if (get_kernel_nofault(opcode, instr)) break; } instr++; if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; } pagefault_enable(); return prefetch; } DEFINE_SPINLOCK(pgd_lock); LIST_HEAD(pgd_list); #ifdef CONFIG_X86_32 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = pgd_index(address); pgd_t *pgd_k; p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pgd += index; pgd_k = init_mm.pgd + index; if (!pgd_present(*pgd_k)) return NULL; /* * set_pgd(pgd, *pgd_k); here would be useless on PAE * and redundant with the set_pmd() on non-PAE. As would * set_p4d/set_pud. */ p4d = p4d_offset(pgd, address); p4d_k = p4d_offset(pgd_k, address); if (!p4d_present(*p4d_k)) return NULL; pud = pud_offset(p4d, address); pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); if (pmd_present(*pmd) != pmd_present(*pmd_k)) set_pmd(pmd, *pmd_k); if (!pmd_present(*pmd_k)) return NULL; else BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k)); return pmd_k; } /* * Handle a fault on the vmalloc or module mapping area * * This is needed because there is a race condition between the time * when the vmalloc mapping code updates the PMD to the point in time * where it synchronizes this update with the other page-tables in the * system. * * In this race window another thread/CPU can map an area on the same * PMD, finds it already present and does not synchronize it with the * rest of the system yet. As a result v[mz]alloc might return areas * which are not mapped in every page-table in the system, causing an * unhandled page-fault when they are accessed. */ static noinline int vmalloc_fault(unsigned long address) { unsigned long pgd_paddr; pmd_t *pmd_k; pte_t *pte_k; /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Do _not_ use "current" here. We might be inside * an interrupt in the middle of a task switch.. */ pgd_paddr = read_cr3_pa(); pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; if (pmd_leaf(*pmd_k)) return 0; pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) return -1; return 0; } NOKPROBE_SYMBOL(vmalloc_fault); void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { unsigned long addr; for (addr = start & PMD_MASK; addr >= TASK_SIZE_MAX && addr < VMALLOC_END; addr += PMD_SIZE) { struct page *page; spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { spinlock_t *pgt_lock; /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); vmalloc_sync_one(page_address(page), addr); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; } static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = &base[pgd_index(address)]; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; #ifdef CONFIG_X86_PAE pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; #define pr_pde pr_cont #else #define pr_pde pr_info #endif p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); #undef pr_pde /* * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And let's rather not kmap-atomic the pte, just in case * it's allocated already: */ if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); out: pr_cont("\n"); } #else /* CONFIG_X86_64: */ #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" "******* Working around it, but it may cause SEGVs or burn power.\n" "******* Please consider a BIOS update.\n" "******* Disabling USB legacy in the BIOS may also help.\n"; #endif static int bad_address(void *p) { unsigned long dummy; return get_kernel_nofault(dummy, (unsigned long *)p); } static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = base + pgd_index(address); p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; if (bad_address(pgd)) goto bad; pr_info("PGD %lx ", pgd_val(*pgd)); if (!pgd_present(*pgd)) goto out; p4d = p4d_offset(pgd, address); if (bad_address(p4d)) goto bad; pr_cont("P4D %lx ", p4d_val(*p4d)); if (!p4d_present(*p4d) || p4d_leaf(*p4d)) goto out; pud = pud_offset(p4d, address); if (bad_address(pud)) goto bad; pr_cont("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_leaf(*pud)) goto out; pmd = pmd_offset(pud, address); if (bad_address(pmd)) goto bad; pr_cont("PMD %lx ", pmd_val(*pmd)); if (!pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); if (bad_address(pte)) goto bad; pr_cont("PTE %lx", pte_val(*pte)); out: pr_cont("\n"); return; bad: pr_info("BAD\n"); } #endif /* CONFIG_X86_64 */ /* * Workaround for K8 erratum #93 & buggy BIOS. * * BIOS SMM functions are required to use a specific workaround * to avoid corruption of the 64bit RIP register on C stepping K8. * * A lot of BIOS that didn't get tested properly miss this. * * The OS sees this as a page fault with the upper 32bits of RIP cleared. * Try to work around it here. * * Note we only handle faults in kernel here. * Does nothing on 32-bit. */ static int is_errata93(struct pt_regs *regs, unsigned long address) { #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD || boot_cpu_data.x86 != 0xf) return 0; if (user_mode(regs)) return 0; if (address != regs->ip) return 0; if ((address >> 32) != 0) return 0; address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { printk_once(errata93_warning); regs->ip = address; return 1; } #endif return 0; } /* * Work around K8 erratum #100 K8 in compat mode occasionally jumps * to illegal addresses >4GB. * * We catch this in the page fault handler because these addresses * are not reachable. Just detect this case and return. Any code * segment in LDT is compatibility mode. */ static int is_errata100(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) return 1; #endif return 0; } /* Pentium F0 0F C7 C8 bug workaround: */ static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) && idt_is_f00f_address(address)) { handle_invalid_op(regs); return 1; } #endif return 0; } static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) { u32 offset = (index >> 3) * sizeof(struct desc_struct); unsigned long addr; struct ldttss_desc desc; if (index == 0) { pr_alert("%s: NULL\n", name); return; } if (offset + sizeof(struct ldttss_desc) >= gdt->size) { pr_alert("%s: 0x%hx -- out of bounds\n", name, index); return; } if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset), sizeof(struct ldttss_desc))) { pr_alert("%s: 0x%hx -- GDT entry is not readable\n", name, index); return; } addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24); #ifdef CONFIG_X86_64 addr |= ((u64)desc.base3 << 32); #endif pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n", name, index, addr, (desc.limit0 | (desc.limit1 << 16))); } static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { if (!oops_may_print()) return; if (error_code & X86_PF_INSTR) { unsigned int level; bool nx, rw; pgd_t *pgd; pte_t *pte; pgd = __va(read_cr3_pa()); pgd += pgd_index(address); pte = lookup_address_in_pgd_attr(pgd, address, &level, &nx, &rw); if (pte && pte_present(*pte) && (!pte_exec(*pte) || nx)) pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", from_kuid(&init_user_ns, current_uid())); if (pte && pte_present(*pte) && pte_exec(*pte) && !nx && (pgd_flags(*pgd) & _PAGE_USER) && (__read_cr4() & X86_CR4_SMEP)) pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n", from_kuid(&init_user_ns, current_uid())); } if (address < PAGE_SIZE && !user_mode(regs)) pr_alert("BUG: kernel NULL pointer dereference, address: %px\n", (void *)address); else pr_alert("BUG: unable to handle page fault for address: %px\n", (void *)address); pr_alert("#PF: %s %s in %s mode\n", (error_code & X86_PF_USER) ? "user" : "supervisor", (error_code & X86_PF_INSTR) ? "instruction fetch" : (error_code & X86_PF_WRITE) ? "write access" : "read access", user_mode(regs) ? "user" : "kernel"); pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, !(error_code & X86_PF_PROT) ? "not-present page" : (error_code & X86_PF_RSVD) ? "reserved bit violation" : (error_code & X86_PF_PK) ? "protection keys violation" : (error_code & X86_PF_RMP) ? "RMP violation" : "permissions violation"); if (!(error_code & X86_PF_USER) && user_mode(regs)) { struct desc_ptr idt, gdt; u16 ldtr, tr; /* * This can happen for quite a few reasons. The more obvious * ones are faults accessing the GDT, or LDT. Perhaps * surprisingly, if the CPU tries to deliver a benign or * contributory exception from user code and gets a page fault * during delivery, the page fault can be delivered as though * it originated directly from user code. This could happen * due to wrong permissions on the IDT, GDT, LDT, TSS, or * kernel or IST stack. */ store_idt(&idt); /* Usable even on Xen PV -- it's just slow. */ native_store_gdt(&gdt); pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n", idt.address, idt.size, gdt.address, gdt.size); store_ldt(ldtr); show_ldttss(&gdt, "LDTR", ldtr); store_tr(tr); show_ldttss(&gdt, "TR", tr); } dump_pagetable(address); if (error_code & X86_PF_RMP) snp_dump_hva_rmpentry(address); } static noinline void pgtable_bad(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct task_struct *tsk; unsigned long flags; int sig; flags = oops_begin(); tsk = current; sig = SIGKILL; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); if (__die("Bad pagetable", regs, error_code)) sig = 0; oops_end(flags, regs, sig); } static void sanitize_error_code(unsigned long address, unsigned long *error_code) { /* * To avoid leaking information about the kernel page * table layout, pretend that user-mode accesses to * kernel addresses are always protection faults. * * NB: This means that failed vsyscalls with vsyscall=none * will have the PROT bit. This doesn't leak any * information and does not appear to cause any problems. */ if (address >= TASK_SIZE_MAX) *error_code |= X86_PF_PROT; } static void set_signal_archinfo(unsigned long address, unsigned long error_code) { struct task_struct *tsk = current; tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | X86_PF_USER; tsk->thread.cr2 = address; } static noinline void page_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { #ifdef CONFIG_VMAP_STACK struct stack_info info; #endif unsigned long flags; int sig; if (user_mode(regs)) { /* * Implicit kernel access from user mode? Skip the stack * overflow and EFI special cases. */ goto oops; } #ifdef CONFIG_VMAP_STACK /* * Stack overflow? During boot, we can fault near the initial * stack in the direct map, but that's not an overflow -- check * that we're in vmalloc space to avoid this. */ if (is_vmalloc_addr((void *)address) && get_stack_guard_info((void *)address, &info)) { /* * We're likely to be running with very little stack space * left. It's plausible that we'd hit this condition but * double-fault even before we get this far, in which case * we're fine: the double-fault handler will deal with it. * * We don't want to make it all the way into the oops code * and then double-fault, though, because we're likely to * break the console driver and lose most of the stack dump. */ call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*), handle_stack_overflow, ASM_CALL_ARG3, , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info)); BUG(); } #endif /* * Buggy firmware could access regions which might page fault. If * this happens, EFI has a special OOPS path that will try to * avoid hanging the system. */ if (IS_ENABLED(CONFIG_EFI)) efi_crash_gracefully_on_page_fault(address); /* Only not-present faults should be handled by KFENCE. */ if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs)) return; oops: /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: */ flags = oops_begin(); show_fault_oops(regs, error_code, address); if (task_stack_end_corrupted(current)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; /* Executive summary in case the body of the oops scrolled away */ printk(KERN_DEFAULT "CR2: %016lx\n", address); oops_end(flags, regs, sig); } static noinline void kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address, int signal, int si_code, u32 pkey) { WARN_ON_ONCE(user_mode(regs)); /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) return; /* * AMD erratum #91 manifests as a spurious page fault on a PREFETCH * instruction. */ if (is_prefetch(regs, error_code, address)) return; page_fault_oops(regs, error_code, address); } /* * Print out info about fatal segfaults, if the show_unhandled_signals * sysctl is set: */ static inline void show_signal_msg(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct task_struct *tsk) { const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG; /* This is a racy snapshot, but it's better than nothing. */ int cpu = raw_smp_processor_id(); if (!unhandled_signal(tsk, SIGSEGV)) return; if (!printk_ratelimit()) return; printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", loglvl, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); print_vma_addr(KERN_CONT " in ", regs->ip); /* * Dump the likely CPU where the fatal segfault happened. * This can help identify faulty hardware. */ printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu, topology_core_id(cpu), topology_physical_package_id(cpu)); printk(KERN_CONT "\n"); show_opcodes(regs, loglvl); } static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 pkey, int si_code) { struct task_struct *tsk = current; if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGSEGV, si_code, pkey); return; } if (!(error_code & X86_PF_USER)) { /* Implicit user access to kernel memory -- just oops */ page_fault_oops(regs, error_code, address); return; } /* * User mode accesses just cause a SIGSEGV. * It's possible to have interrupts off here: */ local_irq_enable(); /* * Valid to do another page fault here because this one came * from user space: */ if (is_prefetch(regs, error_code, address)) return; if (is_errata100(regs, address)) return; sanitize_error_code(address, &error_code); if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) return; if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); set_signal_archinfo(address, error_code); if (si_code == SEGV_PKUERR) force_sig_pkuerr((void __user *)address, pkey); else force_sig_fault(SIGSEGV, si_code, (void __user *)address); local_irq_disable(); } static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address) { __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR); } static void __bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct mm_struct *mm, struct vm_area_struct *vma, u32 pkey, int si_code) { /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ if (mm) mmap_read_unlock(mm); else vma_end_read(vma); __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); } static inline bool bad_area_access_from_pkeys(unsigned long error_code, struct vm_area_struct *vma) { /* This code is always called on the current mm */ bool foreign = false; if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return false; if (error_code & X86_PF_PK) return true; /* this checks permission keys on the VMA: */ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), (error_code & X86_PF_INSTR), foreign)) return true; return false; } static noinline void bad_area_access_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct mm_struct *mm, struct vm_area_struct *vma) { /* * This OSPKE check is not strictly necessary at runtime. * But, doing it this way allows compiler optimizations * if pkeys are compiled out. */ if (bad_area_access_from_pkeys(error_code, vma)) { /* * A protection key fault means that the PKRU value did not allow * access to some PTE. Userspace can figure out what PKRU was * from the XSAVE state. This function captures the pkey from * the vma and passes it to userspace so userspace can discover * which protection key was set on the PTE. * * If we get here, we know that the hardware signaled a X86_PF_PK * fault and that there was a VMA once we got in the fault * handler. It does *not* guarantee that the VMA we find here * was the one that we faulted on. * * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); * 2. T1 : set PKRU to deny access to pkey=4, touches page * 3. T1 : faults... * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); * 5. T1 : enters fault handler, takes mmap_lock, etc... * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ u32 pkey = vma_pkey(vma); __bad_area(regs, error_code, address, mm, vma, pkey, SEGV_PKUERR); } else { __bad_area(regs, error_code, address, mm, vma, 0, SEGV_ACCERR); } } static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, vm_fault_t fault) { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; sanitize_error_code(address, &error_code); if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) return; set_signal_archinfo(address, error_code); #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { struct task_struct *tsk = current; unsigned lsb = 0; pr_err( "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", tsk->comm, tsk->pid, address); if (fault & VM_FAULT_HWPOISON_LARGE) lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); if (fault & VM_FAULT_HWPOISON) lsb = PAGE_SHIFT; force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb); return; } #endif force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) return 0; if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) return 0; return 1; } /* * Handle a spurious fault caused by a stale TLB entry. * * This allows us to lazily refresh the TLB when increasing the * permissions of a kernel page (RO -> RW or NX -> X). Doing it * eagerly is very expensive since that implies doing a full * cross-processor TLB flush, even if no stale TLB entries exist * on other processors. * * Spurious faults may only occur if the TLB contains an entry with * fewer permission than the page table entry. Non-present (P = 0) * and reserved bit (R = 1) faults are never spurious. * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. * * Returns non-zero if a spurious fault was handled, zero otherwise. * * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3 * (Optional Invalidation). */ static noinline int spurious_kernel_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; int ret; /* * Only writes to RO or instruction fetches from NX may cause * spurious faults. * * These could be from user or supervisor accesses but the TLB * is only lazily flushed after a kernel mapping protection * change, so user accesses are not expected to cause spurious * faults. */ if (error_code != (X86_PF_WRITE | X86_PF_PROT) && error_code != (X86_PF_INSTR | X86_PF_PROT)) return 0; pgd = init_mm.pgd + pgd_index(address); if (!pgd_present(*pgd)) return 0; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) return 0; if (p4d_leaf(*p4d)) return spurious_kernel_fault_check(error_code, (pte_t *) p4d); pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; if (pud_leaf(*pud)) return spurious_kernel_fault_check(error_code, (pte_t *) pud); pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; if (pmd_leaf(*pmd)) return spurious_kernel_fault_check(error_code, (pte_t *) pmd); pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) return 0; ret = spurious_kernel_fault_check(error_code, pte); if (!ret) return 0; /* * Make sure we have permissions in PMD. * If not, then there's a bug in the page tables: */ ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); return ret; } NOKPROBE_SYMBOL(spurious_kernel_fault); int show_unhandled_signals = 1; static inline int access_error(unsigned long error_code, struct vm_area_struct *vma) { /* This is only called for the current mm, so: */ bool foreign = false; /* * Read or write was blocked by protection keys. This is * always an unconditional error and can never result in * a follow-up action to resolve the fault, like a COW. */ if (error_code & X86_PF_PK) return 1; /* * SGX hardware blocked the access. This usually happens * when the enclave memory contents have been destroyed, like * after a suspend/resume cycle. In any case, the kernel can't * fix the cause of the fault. Handle the fault as an access * error even in cases where no actual access violation * occurred. This allows userspace to rebuild the enclave in * response to the signal. */ if (unlikely(error_code & X86_PF_SGX)) return 1; /* * Make sure to check the VMA so that we do not perform * faults just to hit a X86_PF_PK as soon as we fill in a * page. */ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), (error_code & X86_PF_INSTR), foreign)) return 1; /* * Shadow stack accesses (PF_SHSTK=1) are only permitted to * shadow stack VMAs. All other accesses result in an error. */ if (error_code & X86_PF_SHSTK) { if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK))) return 1; if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; return 0; } if (error_code & X86_PF_WRITE) { /* write, present and write, not present: */ if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) return 1; if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; return 0; } /* read, present: */ if (unlikely(error_code & X86_PF_PROT)) return 1; /* read, not present: */ if (unlikely(!vma_is_accessible(vma))) return 1; return 0; } bool fault_in_kernel_space(unsigned long address) { /* * On 64-bit systems, the vsyscall page is at an address above * TASK_SIZE_MAX, but is not considered part of the kernel * address space. */ if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address)) return false; return address >= TASK_SIZE_MAX; } /* * Called for all faults where 'address' is part of the kernel address * space. Might get called for faults that originate from *code* that * ran in userspace or the kernel. */ static void do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { /* * Protection keys exceptions only happen on user pages. We * have no user pages in the kernel portion of the address * space, so do not expect them here. */ WARN_ON_ONCE(hw_error_code & X86_PF_PK); #ifdef CONFIG_X86_32 /* * We can fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. * * Before doing this on-demand faulting, ensure that the * fault is not any of the following: * 1. A fault on a PTE with a reserved bit set. * 2. A fault caused by a user-mode access. (Do not demand- * fault kernel memory due to user-mode accesses). * 3. A fault caused by a page-level protection violation. * (A demand fault would be on a non-present page which * would have X86_PF_PROT==0). * * This is only needed to close a race condition on x86-32 in * the vmalloc mapping/unmapping code. See the comment above * vmalloc_fault() for details. On x86-64 the race does not * exist as the vmalloc mappings don't need to be synchronized * there. */ if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; } #endif if (is_f00f_bug(regs, hw_error_code, address)) return; /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; /* kprobes don't want to hook the spurious faults: */ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* * Note, despite being a "bad area", there are quite a few * acceptable reasons to get here, such as erratum fixups * and handling kernel code that can fault, like get_user(). * * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock: */ bad_area_nosemaphore(regs, hw_error_code, address); } NOKPROBE_SYMBOL(do_kern_addr_fault); /* * Handle faults in the user portion of the address space. Nothing in here * should check X86_PF_USER without a specific justification: for almost * all purposes, we should treat a normal kernel access to user memory * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction. * The one exception is AC flag handling, which is, per the x86 * architecture, special for WRUSS. */ static inline void do_user_addr_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; vm_fault_t fault; unsigned int flags = FAULT_FLAG_DEFAULT; tsk = current; mm = tsk->mm; if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) { /* * Whoops, this is kernel mode code trying to execute from * user memory. Unless this is AMD erratum #93, which * corrupts RIP such that it looks like a user address, * this is unrecoverable. Don't even try to look up the * VMA or look for extable entries. */ if (is_errata93(regs, address)) return; page_fault_oops(regs, error_code, address); return; } /* kprobes don't want to hook the spurious faults: */ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* * Reserved bits are never expected to be set on * entries in the user portion of the page tables. */ if (unlikely(error_code & X86_PF_RSVD)) pgtable_bad(regs, error_code, address); /* * If SMAP is on, check for invalid kernel (supervisor) access to user * pages in the user address space. The odd case here is WRUSS, * which, according to the preliminary documentation, does not respect * SMAP and will have the USER bit set so, in all cases, SMAP * enforcement appears to be consistent with the USER bit. */ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && !(error_code & X86_PF_USER) && !(regs->flags & X86_EFLAGS_AC))) { /* * No extable entry here. This was a kernel access to an * invalid pointer. get_kernel_nofault() will not get here. */ page_fault_oops(regs, error_code, address); return; } /* * If we're in an interrupt, have no user context or are running * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { bad_area_nosemaphore(regs, error_code, address); return; } /* Legacy check - remove this after verifying that it doesn't trigger */ if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) { bad_area_nosemaphore(regs, error_code, address); return; } local_irq_enable(); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * Read-only permissions can not be expressed in shadow stack PTEs. * Treat all shadow stack accesses as WRITE faults. This ensures * that the MM will prepare everything (e.g., break COW) such that * maybe_mkwrite() can create a proper shadow stack PTE. */ if (error_code & X86_PF_SHSTK) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; /* * We set FAULT_FLAG_USER based on the register state, not * based on X86_PF_USER. User space accesses that cause * system page faults are still user accesses. */ if (user_mode(regs)) flags |= FAULT_FLAG_USER; #ifdef CONFIG_X86_64 /* * Faults in the vsyscall page might need emulation. The * vsyscall page is at a high address (>PAGE_OFFSET), but is * considered to be part of the user address space. * * The vsyscall page does not have a "real" VMA, so do this * emulation before we go searching for VMAs. * * PKRU never rejects instruction fetches, so we don't need * to consider the PF_PK bit. */ if (is_vsyscall_vaddr(address)) { if (emulate_vsyscall(error_code, regs, address)) return; } #endif if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; vma = lock_vma_under_rcu(mm, address); if (!vma) goto lock_mmap; if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, NULL, vma); count_vm_vma_lock_event(VMA_LOCK_SUCCESS); return; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); if (fault & VM_FAULT_MAJOR) flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } lock_mmap: retry: vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) { bad_area_nosemaphore(regs, error_code, address); return; } /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, mm, vma); return; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked. * * Note that handle_userfault() may also release and reacquire mmap_lock * (and not return with VM_FAULT_RETRY), when returning to userland to * repeat the page fault later with a VM_FAULT_NOPAGE retval * (potentially after handling any pending signal during the return to * userland). The return to userland is identified whenever * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. */ fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) { /* * Quick path to respond to signals. The core mm code * has unlocked the mm for us if we get here. */ if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } /* The fault is fully completed (including releasing mmap lock) */ if (fault & VM_FAULT_COMPLETED) return; /* * If we need to retry the mmap_lock has already been released, * and if there is a fatal signal pending there is no guarantee * that we made any progress. Handle this case first. */ if (unlikely(fault & VM_FAULT_RETRY)) { flags |= FAULT_FLAG_TRIED; goto retry; } mmap_read_unlock(mm); done: if (likely(!(fault & VM_FAULT_ERROR))) return; if (fatal_signal_pending(current) && !user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, 0, 0, ARCH_DEFAULT_PKEY); return; } if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGSEGV, SEGV_MAPERR, ARCH_DEFAULT_PKEY); return; } /* * We ran out of memory, call the OOM killer, and return the * userspace (which will retry the fault, or kill us if we got * oom-killed): */ pagefault_out_of_memory(); } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); else if (fault & VM_FAULT_SIGSEGV) bad_area_nosemaphore(regs, error_code, address); else BUG(); } } NOKPROBE_SYMBOL(do_user_addr_fault); static __always_inline void trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, unsigned long address) { if (user_mode(regs)) trace_page_fault_user(address, regs, error_code); else trace_page_fault_kernel(address, regs, error_code); } static __always_inline void handle_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { trace_page_fault_entries(regs, error_code, address); if (unlikely(kmmio_fault(regs, address))) return; /* Was the fault on kernel-controlled part of the address space? */ if (unlikely(fault_in_kernel_space(address))) { do_kern_addr_fault(regs, error_code, address); } else { do_user_addr_fault(regs, error_code, address); /* * User address page fault handling might have reenabled * interrupts. Fixing up all potential exit points of * do_user_addr_fault() and its leaf functions is just not * doable w/o creating an unholy mess or turning the code * upside down. */ local_irq_disable(); } } DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { irqentry_state_t state; unsigned long address; address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2(); /* * KVM uses #PF vector to deliver 'page not present' events to guests * (asynchronous page fault mechanism). The event happens when a * userspace task is trying to access some valid (from guest's point of * view) memory which is not currently mapped by the host (e.g. the * memory is swapped out). Note, the corresponding "page ready" event * which is injected when the memory becomes available, is delivered via * an interrupt mechanism and not a #PF exception * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). * * We are relying on the interrupted context being sane (valid RSP, * relevant locks not held, etc.), which is fine as long as the * interrupted context had IF=1. We are also relying on the KVM * async pf type field and CR2 being read consistently instead of * getting values from real and async page faults mixed up. * * Fingers crossed. * * The async #PF handling code takes care of idtentry handling * itself. */ if (kvm_handle_async_pf(regs, (u32)address)) return; /* * Entry handling for valid #PF from kernel mode is slightly * different: RCU is already watching and ct_irq_enter() must not * be invoked because a kernel fault on a user space address might * sleep. * * In case the fault hit a RCU idle region the conditional entry * code reenabled RCU to avoid subsequent wreckage which helps * debuggability. */ state = irqentry_enter(regs); instrumentation_begin(); handle_page_fault(regs, error_code, address); instrumentation_end(); irqentry_exit(regs, state); }
26 195 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SUSPEND_H #define _LINUX_SUSPEND_H #include <linux/swap.h> #include <linux/notifier.h> #include <linux/init.h> #include <linux/pm.h> #include <linux/mm.h> #include <linux/freezer.h> #include <asm/errno.h> #ifdef CONFIG_VT extern void pm_set_vt_switch(int); #else static inline void pm_set_vt_switch(int do_switch) { } #endif #ifdef CONFIG_VT_CONSOLE_SLEEP extern void pm_prepare_console(void); extern void pm_restore_console(void); #else static inline void pm_prepare_console(void) { } static inline void pm_restore_console(void) { } #endif typedef int __bitwise suspend_state_t; #define PM_SUSPEND_ON ((__force suspend_state_t) 0) #define PM_SUSPEND_TO_IDLE ((__force suspend_state_t) 1) #define PM_SUSPEND_STANDBY ((__force suspend_state_t) 2) #define PM_SUSPEND_MEM ((__force suspend_state_t) 3) #define PM_SUSPEND_MIN PM_SUSPEND_TO_IDLE #define PM_SUSPEND_MAX ((__force suspend_state_t) 4) /** * struct platform_suspend_ops - Callbacks for managing platform dependent * system sleep states. * * @valid: Callback to determine if given system sleep state is supported by * the platform. * Valid (ie. supported) states are advertised in /sys/power/state. Note * that it still may be impossible to enter given system sleep state if the * conditions aren't right. * There is the %suspend_valid_only_mem function available that can be * assigned to this if the platform only supports mem sleep. * * @begin: Initialise a transition to given system sleep state. * @begin() is executed right prior to suspending devices. The information * conveyed to the platform code by @begin() should be disregarded by it as * soon as @end() is executed. If @begin() fails (ie. returns nonzero), * @prepare(), @enter() and @finish() will not be called by the PM core. * This callback is optional. However, if it is implemented, the argument * passed to @enter() is redundant and should be ignored. * * @prepare: Prepare the platform for entering the system sleep state indicated * by @begin(). * @prepare() is called right after devices have been suspended (ie. the * appropriate .suspend() method has been executed for each device) and * before device drivers' late suspend callbacks are executed. It returns * 0 on success or a negative error code otherwise, in which case the * system cannot enter the desired sleep state (@prepare_late(), @enter(), * and @wake() will not be called in that case). * * @prepare_late: Finish preparing the platform for entering the system sleep * state indicated by @begin(). * @prepare_late is called before disabling nonboot CPUs and after * device drivers' late suspend callbacks have been executed. It returns * 0 on success or a negative error code otherwise, in which case the * system cannot enter the desired sleep state (@enter() will not be * executed). * * @enter: Enter the system sleep state indicated by @begin() or represented by * the argument if @begin() is not implemented. * This callback is mandatory. It returns 0 on success or a negative * error code otherwise, in which case the system cannot enter the desired * sleep state. * * @wake: Called when the system has just left a sleep state, right after * the nonboot CPUs have been enabled and before device drivers' early * resume callbacks are executed. * This callback is optional, but should be implemented by the platforms * that implement @prepare_late(). If implemented, it is always called * after @prepare_late and @enter(), even if one of them fails. * * @finish: Finish wake-up of the platform. * @finish is called right prior to calling device drivers' regular suspend * callbacks. * This callback is optional, but should be implemented by the platforms * that implement @prepare(). If implemented, it is always called after * @enter() and @wake(), even if any of them fails. It is executed after * a failing @prepare. * * @suspend_again: Returns whether the system should suspend again (true) or * not (false). If the platform wants to poll sensors or execute some * code during suspended without invoking userspace and most of devices, * suspend_again callback is the place assuming that periodic-wakeup or * alarm-wakeup is already setup. This allows to execute some codes while * being kept suspended in the view of userland and devices. * * @end: Called by the PM core right after resuming devices, to indicate to * the platform that the system has returned to the working state or * the transition to the sleep state has been aborted. * This callback is optional, but should be implemented by the platforms * that implement @begin(). Accordingly, platforms implementing @begin() * should also provide a @end() which cleans up transitions aborted before * @enter(). * * @recover: Recover the platform from a suspend failure. * Called by the PM core if the suspending of devices fails. * This callback is optional and should only be implemented by platforms * which require special recovery actions in that situation. */ struct platform_suspend_ops { int (*valid)(suspend_state_t state); int (*begin)(suspend_state_t state); int (*prepare)(void); int (*prepare_late)(void); int (*enter)(suspend_state_t state); void (*wake)(void); void (*finish)(void); bool (*suspend_again)(void); void (*end)(void); void (*recover)(void); }; struct platform_s2idle_ops { int (*begin)(void); int (*prepare)(void); int (*prepare_late)(void); void (*check)(void); bool (*wake)(void); void (*restore_early)(void); void (*restore)(void); void (*end)(void); }; #ifdef CONFIG_SUSPEND extern suspend_state_t pm_suspend_target_state; extern suspend_state_t mem_sleep_current; extern suspend_state_t mem_sleep_default; /** * suspend_set_ops - set platform dependent suspend operations * @ops: The new suspend operations to set. */ extern void suspend_set_ops(const struct platform_suspend_ops *ops); extern int suspend_valid_only_mem(suspend_state_t state); extern unsigned int pm_suspend_global_flags; #define PM_SUSPEND_FLAG_FW_SUSPEND BIT(0) #define PM_SUSPEND_FLAG_FW_RESUME BIT(1) #define PM_SUSPEND_FLAG_NO_PLATFORM BIT(2) static inline void pm_suspend_clear_flags(void) { pm_suspend_global_flags = 0; } static inline void pm_set_suspend_via_firmware(void) { pm_suspend_global_flags |= PM_SUSPEND_FLAG_FW_SUSPEND; } static inline void pm_set_resume_via_firmware(void) { pm_suspend_global_flags |= PM_SUSPEND_FLAG_FW_RESUME; } static inline void pm_set_suspend_no_platform(void) { pm_suspend_global_flags |= PM_SUSPEND_FLAG_NO_PLATFORM; } /** * pm_suspend_via_firmware - Check if platform firmware will suspend the system. * * To be called during system-wide power management transitions to sleep states * or during the subsequent system-wide transitions back to the working state. * * Return 'true' if the platform firmware is going to be invoked at the end of * the system-wide power management transition (to a sleep state) in progress in * order to complete it, or if the platform firmware has been invoked in order * to complete the last (or preceding) transition of the system to a sleep * state. * * This matters if the caller needs or wants to carry out some special actions * depending on whether or not control will be passed to the platform firmware * subsequently (for example, the device may need to be reset before letting the * platform firmware manipulate it, which is not necessary when the platform * firmware is not going to be invoked) or when such special actions may have * been carried out during the preceding transition of the system to a sleep * state (as they may need to be taken into account). */ static inline bool pm_suspend_via_firmware(void) { return !!(pm_suspend_global_flags & PM_SUSPEND_FLAG_FW_SUSPEND); } /** * pm_resume_via_firmware - Check if platform firmware has woken up the system. * * To be called during system-wide power management transitions from sleep * states. * * Return 'true' if the platform firmware has passed control to the kernel at * the beginning of the system-wide power management transition in progress, so * the event that woke up the system from sleep has been handled by the platform * firmware. */ static inline bool pm_resume_via_firmware(void) { return !!(pm_suspend_global_flags & PM_SUSPEND_FLAG_FW_RESUME); } /** * pm_suspend_no_platform - Check if platform may change device power states. * * To be called during system-wide power management transitions to sleep states * or during the subsequent system-wide transitions back to the working state. * * Return 'true' if the power states of devices remain under full control of the * kernel throughout the system-wide suspend and resume cycle in progress (that * is, if a device is put into a certain power state during suspend, it can be * expected to remain in that state during resume). */ static inline bool pm_suspend_no_platform(void) { return !!(pm_suspend_global_flags & PM_SUSPEND_FLAG_NO_PLATFORM); } /* Suspend-to-idle state machnine. */ enum s2idle_states { S2IDLE_STATE_NONE, /* Not suspended/suspending. */ S2IDLE_STATE_ENTER, /* Enter suspend-to-idle. */ S2IDLE_STATE_WAKE, /* Wake up from suspend-to-idle. */ }; extern enum s2idle_states __read_mostly s2idle_state; static inline bool idle_should_enter_s2idle(void) { return unlikely(s2idle_state == S2IDLE_STATE_ENTER); } extern bool pm_suspend_default_s2idle(void); extern void __init pm_states_init(void); extern void s2idle_set_ops(const struct platform_s2idle_ops *ops); extern void s2idle_wake(void); /** * arch_suspend_disable_irqs - disable IRQs for suspend * * Disables IRQs (in the default case). This is a weak symbol in the common * code and thus allows architectures to override it if more needs to be * done. Not called for suspend to disk. */ extern void arch_suspend_disable_irqs(void); /** * arch_suspend_enable_irqs - enable IRQs after suspend * * Enables IRQs (in the default case). This is a weak symbol in the common * code and thus allows architectures to override it if more needs to be * done. Not called for suspend to disk. */ extern void arch_suspend_enable_irqs(void); extern int pm_suspend(suspend_state_t state); extern bool sync_on_suspend_enabled; #else /* !CONFIG_SUSPEND */ #define suspend_valid_only_mem NULL #define pm_suspend_target_state (PM_SUSPEND_ON) static inline void pm_suspend_clear_flags(void) {} static inline void pm_set_suspend_via_firmware(void) {} static inline void pm_set_resume_via_firmware(void) {} static inline bool pm_suspend_via_firmware(void) { return false; } static inline bool pm_resume_via_firmware(void) { return false; } static inline bool pm_suspend_no_platform(void) { return false; } static inline bool pm_suspend_default_s2idle(void) { return false; } static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } static inline bool sync_on_suspend_enabled(void) { return true; } static inline bool idle_should_enter_s2idle(void) { return false; } static inline void __init pm_states_init(void) {} static inline void s2idle_set_ops(const struct platform_s2idle_ops *ops) {} static inline void s2idle_wake(void) {} #endif /* !CONFIG_SUSPEND */ static inline bool pm_suspend_in_progress(void) { return pm_suspend_target_state != PM_SUSPEND_ON; } /* struct pbe is used for creating lists of pages that should be restored * atomically during the resume from disk, because the page frames they have * occupied before the suspend are in use. */ struct pbe { void *address; /* address of the copy */ void *orig_address; /* original address of a page */ struct pbe *next; }; /** * struct platform_hibernation_ops - hibernation platform support * * The methods in this structure allow a platform to carry out special * operations required by it during a hibernation transition. * * All the methods below, except for @recover(), must be implemented. * * @begin: Tell the platform driver that we're starting hibernation. * Called right after shrinking memory and before freezing devices. * * @end: Called by the PM core right after resuming devices, to indicate to * the platform that the system has returned to the working state. * * @pre_snapshot: Prepare the platform for creating the hibernation image. * Called right after devices have been frozen and before the nonboot * CPUs are disabled (runs with IRQs on). * * @finish: Restore the previous state of the platform after the hibernation * image has been created *or* put the platform into the normal operation * mode after the hibernation (the same method is executed in both cases). * Called right after the nonboot CPUs have been enabled and before * thawing devices (runs with IRQs on). * * @prepare: Prepare the platform for entering the low power state. * Called right after the hibernation image has been saved and before * devices are prepared for entering the low power state. * * @enter: Put the system into the low power state after the hibernation image * has been saved to disk. * Called after the nonboot CPUs have been disabled and all of the low * level devices have been shut down (runs with IRQs off). * * @leave: Perform the first stage of the cleanup after the system sleep state * indicated by @set_target() has been left. * Called right after the control has been passed from the boot kernel to * the image kernel, before the nonboot CPUs are enabled and before devices * are resumed. Executed with interrupts disabled. * * @pre_restore: Prepare system for the restoration from a hibernation image. * Called right after devices have been frozen and before the nonboot * CPUs are disabled (runs with IRQs on). * * @restore_cleanup: Clean up after a failing image restoration. * Called right after the nonboot CPUs have been enabled and before * thawing devices (runs with IRQs on). * * @recover: Recover the platform from a failure to suspend devices. * Called by the PM core if the suspending of devices during hibernation * fails. This callback is optional and should only be implemented by * platforms which require special recovery actions in that situation. */ struct platform_hibernation_ops { int (*begin)(pm_message_t stage); void (*end)(void); int (*pre_snapshot)(void); void (*finish)(void); int (*prepare)(void); int (*enter)(void); void (*leave)(void); int (*pre_restore)(void); void (*restore_cleanup)(void); void (*recover)(void); }; #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ extern void register_nosave_region(unsigned long b, unsigned long e); extern int swsusp_page_is_forbidden(struct page *); extern void swsusp_set_page_free(struct page *); extern void swsusp_unset_page_free(struct page *); extern unsigned long get_safe_page(gfp_t gfp_mask); extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); extern u32 swsusp_hardware_signature; extern void hibernation_set_ops(const struct platform_hibernation_ops *ops); extern int hibernate(void); extern bool system_entering_hibernation(void); extern bool hibernation_available(void); asmlinkage int swsusp_save(void); extern struct pbe *restore_pblist; int pfn_is_nosave(unsigned long pfn); int hibernate_quiet_exec(int (*func)(void *data), void *data); int hibernate_resume_nonboot_cpu_disable(void); int arch_hibernation_header_save(void *addr, unsigned int max_size); int arch_hibernation_header_restore(void *addr); #else /* CONFIG_HIBERNATION */ static inline void register_nosave_region(unsigned long b, unsigned long e) {} static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} static inline void swsusp_unset_page_free(struct page *p) {} static inline void hibernation_set_ops(const struct platform_hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } static inline bool system_entering_hibernation(void) { return false; } static inline bool hibernation_available(void) { return false; } static inline int hibernate_quiet_exec(int (*func)(void *data), void *data) { return -ENOTSUPP; } #endif /* CONFIG_HIBERNATION */ int arch_resume_nosmt(void); #ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV int is_hibernate_resume_dev(dev_t dev); #else static inline int is_hibernate_resume_dev(dev_t dev) { return 0; } #endif /* Hibernation and suspend events */ #define PM_HIBERNATION_PREPARE 0x0001 /* Going to hibernate */ #define PM_POST_HIBERNATION 0x0002 /* Hibernation finished */ #define PM_SUSPEND_PREPARE 0x0003 /* Going to suspend the system */ #define PM_POST_SUSPEND 0x0004 /* Suspend finished */ #define PM_RESTORE_PREPARE 0x0005 /* Going to restore a saved image */ #define PM_POST_RESTORE 0x0006 /* Restore failed */ extern struct mutex system_transition_mutex; #ifdef CONFIG_PM_SLEEP void save_processor_state(void); void restore_processor_state(void); /* kernel/power/main.c */ extern int register_pm_notifier(struct notifier_block *nb); extern int unregister_pm_notifier(struct notifier_block *nb); extern void ksys_sync_helper(void); extern void pm_report_hw_sleep_time(u64 t); extern void pm_report_max_hw_sleep(u64 t); void pm_restrict_gfp_mask(void); void pm_restore_gfp_mask(void); #define pm_notifier(fn, pri) { \ static struct notifier_block fn##_nb = \ { .notifier_call = fn, .priority = pri }; \ register_pm_notifier(&fn##_nb); \ } /* drivers/base/power/wakeup.c */ extern bool events_check_enabled; static inline bool pm_suspended_storage(void) { return !gfp_has_io_fs(gfp_allowed_mask); } extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); extern void pm_system_cancel_wakeup(void); extern void pm_wakeup_clear(unsigned int irq_number); extern void pm_system_irq_wakeup(unsigned int irq_number); extern unsigned int pm_wakeup_irq(void); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); extern void pm_wakep_autosleep_enabled(bool set); extern void pm_print_active_wakeup_sources(void); extern unsigned int lock_system_sleep(void); extern void unlock_system_sleep(unsigned int); extern bool pm_sleep_transition_in_progress(void); bool pm_hibernate_is_recovering(void); #else /* !CONFIG_PM_SLEEP */ static inline int register_pm_notifier(struct notifier_block *nb) { return 0; } static inline int unregister_pm_notifier(struct notifier_block *nb) { return 0; } static inline void pm_report_hw_sleep_time(u64 t) {}; static inline void pm_report_max_hw_sleep(u64 t) {}; static inline void pm_restrict_gfp_mask(void) {} static inline void pm_restore_gfp_mask(void) {} static inline void ksys_sync_helper(void) {} #define pm_notifier(fn, pri) do { (void)(fn); } while (0) static inline bool pm_suspended_storage(void) { return false; } static inline bool pm_wakeup_pending(void) { return false; } static inline void pm_system_wakeup(void) {} static inline void pm_wakeup_clear(bool reset) {} static inline void pm_system_irq_wakeup(unsigned int irq_number) {} static inline unsigned int lock_system_sleep(void) { return 0; } static inline void unlock_system_sleep(unsigned int flags) {} static inline bool pm_sleep_transition_in_progress(void) { return false; } static inline bool pm_hibernate_is_recovering(void) { return false; } #endif /* !CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG extern bool pm_print_times_enabled; extern bool pm_debug_messages_on; extern bool pm_debug_messages_should_print(void); static inline int pm_dyn_debug_messages_on(void) { #ifdef CONFIG_DYNAMIC_DEBUG return 1; #else return 0; #endif } #ifndef pr_fmt #define pr_fmt(fmt) "PM: " fmt #endif #define __pm_pr_dbg(fmt, ...) \ do { \ if (pm_debug_messages_should_print()) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ else if (pm_dyn_debug_messages_on()) \ pr_debug(fmt, ##__VA_ARGS__); \ } while (0) #define __pm_deferred_pr_dbg(fmt, ...) \ do { \ if (pm_debug_messages_should_print()) \ printk_deferred(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ } while (0) #else #define pm_print_times_enabled (false) #define pm_debug_messages_on (false) #include <linux/printk.h> #define __pm_pr_dbg(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #define __pm_deferred_pr_dbg(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /** * pm_pr_dbg - print pm sleep debug messages * * If pm_debug_messages_on is enabled and the system is entering/leaving * suspend, print message. * If pm_debug_messages_on is disabled and CONFIG_DYNAMIC_DEBUG is enabled, * print message only from instances explicitly enabled on dynamic debug's * control. * If pm_debug_messages_on is disabled and CONFIG_DYNAMIC_DEBUG is disabled, * don't print message. */ #define pm_pr_dbg(fmt, ...) \ __pm_pr_dbg(fmt, ##__VA_ARGS__) #define pm_deferred_pr_dbg(fmt, ...) \ __pm_deferred_pr_dbg(fmt, ##__VA_ARGS__) #ifdef CONFIG_PM_AUTOSLEEP /* kernel/power/autosleep.c */ void queue_up_suspend_work(void); #else /* !CONFIG_PM_AUTOSLEEP */ static inline void queue_up_suspend_work(void) {} #endif /* !CONFIG_PM_AUTOSLEEP */ enum suspend_stat_step { SUSPEND_WORKING = 0, SUSPEND_FREEZE, SUSPEND_PREPARE, SUSPEND_SUSPEND, SUSPEND_SUSPEND_LATE, SUSPEND_SUSPEND_NOIRQ, SUSPEND_RESUME_NOIRQ, SUSPEND_RESUME_EARLY, SUSPEND_RESUME }; void dpm_save_failed_dev(const char *name); void dpm_save_failed_step(enum suspend_stat_step step); #endif /* _LINUX_SUSPEND_H */
23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2021 Intel Corporation */ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include "hci_codec.h" static int hci_codec_list_add(struct list_head *list, struct hci_op_read_local_codec_caps *sent, struct hci_rp_read_local_codec_caps *rp, void *caps, __u32 len) { struct codec_list *entry; entry = kzalloc(sizeof(*entry) + len, GFP_KERNEL); if (!entry) return -ENOMEM; entry->id = sent->id; if (sent->id == 0xFF) { entry->cid = __le16_to_cpu(sent->cid); entry->vid = __le16_to_cpu(sent->vid); } entry->transport = sent->transport; entry->len = len; entry->num_caps = 0; if (rp) { entry->num_caps = rp->num_caps; memcpy(entry->caps, caps, len); } list_add(&entry->list, list); return 0; } void hci_codec_list_clear(struct list_head *codec_list) { struct codec_list *c, *n; list_for_each_entry_safe(c, n, codec_list, list) { list_del(&c->list); kfree(c); } } static void hci_read_codec_capabilities(struct hci_dev *hdev, __u8 transport, struct hci_op_read_local_codec_caps *cmd) { __u8 i; for (i = 0; i < TRANSPORT_TYPE_MAX; i++) { if (transport & BIT(i)) { struct hci_rp_read_local_codec_caps *rp; struct hci_codec_caps *caps; struct sk_buff *skb; __u8 j; __u32 len; cmd->transport = i; /* If Read_Codec_Capabilities command is not supported * then just add codec to the list without caps */ if (!(hdev->commands[45] & 0x08)) { hci_dev_lock(hdev); hci_codec_list_add(&hdev->local_codecs, cmd, NULL, NULL, 0); hci_dev_unlock(hdev); continue; } skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODEC_CAPS, sizeof(*cmd), cmd, 0, HCI_CMD_TIMEOUT, NULL); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read codec capabilities (%ld)", PTR_ERR(skb)); continue; } if (skb->len < sizeof(*rp)) goto error; rp = (void *)skb->data; if (rp->status) goto error; if (!rp->num_caps) { len = 0; /* this codec doesn't have capabilities */ goto skip_caps_parse; } skb_pull(skb, sizeof(*rp)); for (j = 0, len = 0; j < rp->num_caps; j++) { caps = (void *)skb->data; if (skb->len < sizeof(*caps)) goto error; if (skb->len < caps->len) goto error; len += sizeof(caps->len) + caps->len; skb_pull(skb, sizeof(caps->len) + caps->len); } skip_caps_parse: hci_dev_lock(hdev); hci_codec_list_add(&hdev->local_codecs, cmd, rp, (__u8 *)rp + sizeof(*rp), len); hci_dev_unlock(hdev); error: kfree_skb(skb); } } } void hci_read_supported_codecs(struct hci_dev *hdev) { struct sk_buff *skb; struct hci_rp_read_local_supported_codecs *rp; struct hci_std_codecs *std_codecs; struct hci_vnd_codecs *vnd_codecs; struct hci_op_read_local_codec_caps caps; __u8 i; skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODECS, 0, NULL, 0, HCI_CMD_TIMEOUT, NULL); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read local supported codecs (%ld)", PTR_ERR(skb)); return; } if (skb->len < sizeof(*rp)) goto error; rp = (void *)skb->data; if (rp->status) goto error; skb_pull(skb, sizeof(rp->status)); std_codecs = (void *)skb->data; /* validate codecs length before accessing */ if (skb->len < flex_array_size(std_codecs, codec, std_codecs->num) + sizeof(std_codecs->num)) goto error; /* enumerate codec capabilities of standard codecs */ memset(&caps, 0, sizeof(caps)); for (i = 0; i < std_codecs->num; i++) { caps.id = std_codecs->codec[i]; caps.direction = 0x00; hci_read_codec_capabilities(hdev, LOCAL_CODEC_ACL_MASK | LOCAL_CODEC_SCO_MASK, &caps); } skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num) + sizeof(std_codecs->num)); vnd_codecs = (void *)skb->data; /* validate vendor codecs length before accessing */ if (skb->len < flex_array_size(vnd_codecs, codec, vnd_codecs->num) + sizeof(vnd_codecs->num)) goto error; /* enumerate vendor codec capabilities */ for (i = 0; i < vnd_codecs->num; i++) { caps.id = 0xFF; caps.cid = vnd_codecs->codec[i].cid; caps.vid = vnd_codecs->codec[i].vid; caps.direction = 0x00; hci_read_codec_capabilities(hdev, LOCAL_CODEC_ACL_MASK | LOCAL_CODEC_SCO_MASK, &caps); } error: kfree_skb(skb); } void hci_read_supported_codecs_v2(struct hci_dev *hdev) { struct sk_buff *skb; struct hci_rp_read_local_supported_codecs_v2 *rp; struct hci_std_codecs_v2 *std_codecs; struct hci_vnd_codecs_v2 *vnd_codecs; struct hci_op_read_local_codec_caps caps; __u8 i; skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODECS_V2, 0, NULL, 0, HCI_CMD_TIMEOUT, NULL); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read local supported codecs (%ld)", PTR_ERR(skb)); return; } if (skb->len < sizeof(*rp)) goto error; rp = (void *)skb->data; if (rp->status) goto error; skb_pull(skb, sizeof(rp->status)); std_codecs = (void *)skb->data; /* check for payload data length before accessing */ if (skb->len < flex_array_size(std_codecs, codec, std_codecs->num) + sizeof(std_codecs->num)) goto error; memset(&caps, 0, sizeof(caps)); for (i = 0; i < std_codecs->num; i++) { caps.id = std_codecs->codec[i].id; hci_read_codec_capabilities(hdev, std_codecs->codec[i].transport, &caps); } skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num) + sizeof(std_codecs->num)); vnd_codecs = (void *)skb->data; /* check for payload data length before accessing */ if (skb->len < flex_array_size(vnd_codecs, codec, vnd_codecs->num) + sizeof(vnd_codecs->num)) goto error; for (i = 0; i < vnd_codecs->num; i++) { caps.id = 0xFF; caps.cid = vnd_codecs->codec[i].cid; caps.vid = vnd_codecs->codec[i].vid; hci_read_codec_capabilities(hdev, vnd_codecs->codec[i].transport, &caps); } error: kfree_skb(skb); }
9 43 43 43 43 41 43 3 3 22 22 22 9 22 15 15 15 6 5 2 2 8 8 8 8 7 8 2 8 6 6 6 8 8 8 10 10 6 8 2 6 8 10 8 1 1 7 1 1 6 8 3 3 5 5 3 3 3 3 9 9 8 6 7 1 1 6 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * The HSR spec says never to forward the same frame twice on the same * interface. A frame is identified by its source MAC address and its HSR * sequence number. This code keeps track of senders and their sequence numbers * to allow filtering of duplicate frames, and to detect HSR ring errors. * Same code handles filtering of duplicates for PRP as well. */ #include <linux/if_ether.h> #include <linux/etherdevice.h> #include <linux/slab.h> #include <linux/rculist.h> #include "hsr_main.h" #include "hsr_framereg.h" #include "hsr_netlink.h" /* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b, * false otherwise. */ static bool seq_nr_after(u16 a, u16 b) { /* Remove inconsistency where * seq_nr_after(a, b) == seq_nr_before(a, b) */ if ((int)b - a == 32768) return false; return (((s16)(b - a)) < 0); } #define seq_nr_before(a, b) seq_nr_after((b), (a)) #define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) #define PRP_DROP_WINDOW_LEN 32768 bool hsr_addr_is_redbox(struct hsr_priv *hsr, unsigned char *addr) { if (!hsr->redbox || !is_valid_ether_addr(hsr->macaddress_redbox)) return false; return ether_addr_equal(addr, hsr->macaddress_redbox); } bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) { struct hsr_self_node *sn; bool ret = false; rcu_read_lock(); sn = rcu_dereference(hsr->self_node); if (!sn) { WARN_ONCE(1, "HSR: No self node\n"); goto out; } if (ether_addr_equal(addr, sn->macaddress_A) || ether_addr_equal(addr, sn->macaddress_B)) ret = true; out: rcu_read_unlock(); return ret; } /* Search for mac entry. Caller must hold rcu read lock. */ static struct hsr_node *find_node_by_addr_A(struct list_head *node_db, const unsigned char addr[ETH_ALEN]) { struct hsr_node *node; list_for_each_entry_rcu(node, node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, addr)) return node; } return NULL; } /* Check if node for a given MAC address is already present in data base */ bool hsr_is_node_in_db(struct list_head *node_db, const unsigned char addr[ETH_ALEN]) { return !!find_node_by_addr_A(node_db, addr); } /* Helper for device init; the self_node is used in hsr_rcv() to recognize * frames from self that's been looped over the HSR ring. */ int hsr_create_self_node(struct hsr_priv *hsr, const unsigned char addr_a[ETH_ALEN], const unsigned char addr_b[ETH_ALEN]) { struct hsr_self_node *sn, *old; sn = kmalloc(sizeof(*sn), GFP_KERNEL); if (!sn) return -ENOMEM; ether_addr_copy(sn->macaddress_A, addr_a); ether_addr_copy(sn->macaddress_B, addr_b); spin_lock_bh(&hsr->list_lock); old = rcu_replace_pointer(hsr->self_node, sn, lockdep_is_held(&hsr->list_lock)); spin_unlock_bh(&hsr->list_lock); if (old) kfree_rcu(old, rcu_head); return 0; } void hsr_del_self_node(struct hsr_priv *hsr) { struct hsr_self_node *old; spin_lock_bh(&hsr->list_lock); old = rcu_replace_pointer(hsr->self_node, NULL, lockdep_is_held(&hsr->list_lock)); spin_unlock_bh(&hsr->list_lock); if (old) kfree_rcu(old, rcu_head); } void hsr_del_nodes(struct list_head *node_db) { struct hsr_node *node; struct hsr_node *tmp; list_for_each_entry_safe(node, tmp, node_db, mac_list) kfree(node); } void prp_handle_san_frame(bool san, enum hsr_port_type port, struct hsr_node *node) { /* Mark if the SAN node is over LAN_A or LAN_B */ if (port == HSR_PT_SLAVE_A) { node->san_a = true; return; } if (port == HSR_PT_SLAVE_B) node->san_b = true; } /* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A; * seq_out is used to initialize filtering of outgoing duplicate frames * originating from the newly added node. */ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, struct list_head *node_db, unsigned char addr[], u16 seq_out, bool san, enum hsr_port_type rx_port) { struct hsr_node *new_node, *node; unsigned long now; int i; new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC); if (!new_node) return NULL; ether_addr_copy(new_node->macaddress_A, addr); spin_lock_init(&new_node->seq_out_lock); /* We are only interested in time diffs here, so use current jiffies * as initialization. (0 could trigger an spurious ring error warning). */ now = jiffies; for (i = 0; i < HSR_PT_PORTS; i++) { new_node->time_in[i] = now; new_node->time_out[i] = now; } for (i = 0; i < HSR_PT_PORTS; i++) { new_node->seq_out[i] = seq_out; new_node->seq_expected[i] = seq_out + 1; new_node->seq_start[i] = seq_out + 1; } if (san && hsr->proto_ops->handle_san_frame) hsr->proto_ops->handle_san_frame(san, rx_port, new_node); spin_lock_bh(&hsr->list_lock); list_for_each_entry_rcu(node, node_db, mac_list, lockdep_is_held(&hsr->list_lock)) { if (ether_addr_equal(node->macaddress_A, addr)) goto out; if (ether_addr_equal(node->macaddress_B, addr)) goto out; } list_add_tail_rcu(&new_node->mac_list, node_db); spin_unlock_bh(&hsr->list_lock); return new_node; out: spin_unlock_bh(&hsr->list_lock); kfree(new_node); return node; } void prp_update_san_info(struct hsr_node *node, bool is_sup) { if (!is_sup) return; node->san_a = false; node->san_b = false; } /* Get the hsr_node from which 'skb' was sent. */ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, struct sk_buff *skb, bool is_sup, enum hsr_port_type rx_port) { struct hsr_priv *hsr = port->hsr; struct hsr_node *node; struct ethhdr *ethhdr; struct prp_rct *rct; bool san = false; u16 seq_out; if (!skb_mac_header_was_set(skb)) return NULL; ethhdr = (struct ethhdr *)skb_mac_header(skb); list_for_each_entry_rcu(node, node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) { if (hsr->proto_ops->update_san_info) hsr->proto_ops->update_san_info(node, is_sup); return node; } if (ether_addr_equal(node->macaddress_B, ethhdr->h_source)) { if (hsr->proto_ops->update_san_info) hsr->proto_ops->update_san_info(node, is_sup); return node; } } /* Check if required node is not in proxy nodes table */ list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) { if (hsr->proto_ops->update_san_info) hsr->proto_ops->update_san_info(node, is_sup); return node; } } /* Everyone may create a node entry, connected node to a HSR/PRP * device. */ if (ethhdr->h_proto == htons(ETH_P_PRP) || ethhdr->h_proto == htons(ETH_P_HSR)) { /* Check if skb contains hsr_ethhdr */ if (skb->mac_len < sizeof(struct hsr_ethhdr)) return NULL; /* Use the existing sequence_nr from the tag as starting point * for filtering duplicate frames. */ seq_out = hsr_get_skb_sequence_nr(skb) - 1; } else { rct = skb_get_PRP_rct(skb); if (rct && prp_check_lsdu_size(skb, rct, is_sup)) { seq_out = prp_get_skb_sequence_nr(rct); } else { if (rx_port != HSR_PT_MASTER) san = true; seq_out = HSR_SEQNR_START; } } return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out, san, rx_port); } /* Use the Supervision frame's info about an eventual macaddress_B for merging * nodes that has previously had their macaddress_B registered as a separate * node. */ void hsr_handle_sup_frame(struct hsr_frame_info *frame) { struct hsr_node *node_curr = frame->node_src; struct hsr_port *port_rcv = frame->port_rcv; struct hsr_priv *hsr = port_rcv->hsr; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tlv *hsr_sup_tlv; struct hsr_node *node_real; struct sk_buff *skb = NULL; struct list_head *node_db; struct ethhdr *ethhdr; int i; unsigned int pull_size = 0; unsigned int total_pull_size = 0; /* Here either frame->skb_hsr or frame->skb_prp should be * valid as supervision frame always will have protocol * header info. */ if (frame->skb_hsr) skb = frame->skb_hsr; else if (frame->skb_prp) skb = frame->skb_prp; else if (frame->skb_std) skb = frame->skb_std; if (!skb) return; /* Leave the ethernet header. */ pull_size = sizeof(struct ethhdr); skb_pull(skb, pull_size); total_pull_size += pull_size; ethhdr = (struct ethhdr *)skb_mac_header(skb); /* And leave the HSR tag. */ if (ethhdr->h_proto == htons(ETH_P_HSR)) { pull_size = sizeof(struct hsr_tag); skb_pull(skb, pull_size); total_pull_size += pull_size; } /* And leave the HSR sup tag. */ pull_size = sizeof(struct hsr_sup_tag); skb_pull(skb, pull_size); total_pull_size += pull_size; /* get HSR sup payload */ hsr_sp = (struct hsr_sup_payload *)skb->data; /* Merge node_curr (registered on macaddress_B) into node_real */ node_db = &port_rcv->hsr->node_db; node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A); if (!node_real) /* No frame received from AddrA of this node yet */ node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A, HSR_SEQNR_START - 1, true, port_rcv->type); if (!node_real) goto done; /* No mem */ if (node_real == node_curr) /* Node has already been merged */ goto done; /* Leave the first HSR sup payload. */ pull_size = sizeof(struct hsr_sup_payload); skb_pull(skb, pull_size); total_pull_size += pull_size; /* Get second supervision tlv */ hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; /* And check if it is a redbox mac TLV */ if (hsr_sup_tlv->HSR_TLV_type == PRP_TLV_REDBOX_MAC) { /* We could stop here after pushing hsr_sup_payload, * or proceed and allow macaddress_B and for redboxes. */ /* Sanity check length */ if (hsr_sup_tlv->HSR_TLV_length != 6) goto done; /* Leave the second HSR sup tlv. */ pull_size = sizeof(struct hsr_sup_tlv); skb_pull(skb, pull_size); total_pull_size += pull_size; /* Get redbox mac address. */ hsr_sp = (struct hsr_sup_payload *)skb->data; /* Check if redbox mac and node mac are equal. */ if (!ether_addr_equal(node_real->macaddress_A, hsr_sp->macaddress_A)) { /* This is a redbox supervision frame for a VDAN! */ goto done; } } ether_addr_copy(node_real->macaddress_B, ethhdr->h_source); spin_lock_bh(&node_real->seq_out_lock); for (i = 0; i < HSR_PT_PORTS; i++) { if (!node_curr->time_in_stale[i] && time_after(node_curr->time_in[i], node_real->time_in[i])) { node_real->time_in[i] = node_curr->time_in[i]; node_real->time_in_stale[i] = node_curr->time_in_stale[i]; } if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i])) node_real->seq_out[i] = node_curr->seq_out[i]; } spin_unlock_bh(&node_real->seq_out_lock); node_real->addr_B_port = port_rcv->type; spin_lock_bh(&hsr->list_lock); if (!node_curr->removed) { list_del_rcu(&node_curr->mac_list); node_curr->removed = true; kfree_rcu(node_curr, rcu_head); } spin_unlock_bh(&hsr->list_lock); done: /* Push back here */ skb_push(skb, total_pull_size); } /* 'skb' is a frame meant for this host, that is to be passed to upper layers. * * If the frame was sent by a node's B interface, replace the source * address with that node's "official" address (macaddress_A) so that upper * layers recognize where it came from. */ void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb) { if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: Mac header not set\n", __func__); return; } memcpy(&eth_hdr(skb)->h_source, node->macaddress_A, ETH_ALEN); } /* 'skb' is a frame meant for another host. * 'port' is the outgoing interface * * Substitute the target (dest) MAC address if necessary, so the it matches the * recipient interface MAC address, regardless of whether that is the * recipient's A or B interface. * This is needed to keep the packets flowing through switches that learn on * which "side" the different interfaces are. */ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, struct hsr_port *port) { struct hsr_node *node_dst; if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: Mac header not set\n", __func__); return; } if (!is_unicast_ether_addr(eth_hdr(skb)->h_dest)) return; node_dst = find_node_by_addr_A(&port->hsr->node_db, eth_hdr(skb)->h_dest); if (!node_dst && port->hsr->redbox) node_dst = find_node_by_addr_A(&port->hsr->proxy_node_db, eth_hdr(skb)->h_dest); if (!node_dst) { if (port->hsr->prot_version != PRP_V1 && net_ratelimit()) netdev_err(skb->dev, "%s: Unknown node\n", __func__); return; } if (port->type != node_dst->addr_B_port) return; if (is_valid_ether_addr(node_dst->macaddress_B)) ether_addr_copy(eth_hdr(skb)->h_dest, node_dst->macaddress_B); } void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, u16 sequence_nr) { /* Don't register incoming frames without a valid sequence number. This * ensures entries of restarted nodes gets pruned so that they can * re-register and resume communications. */ if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && seq_nr_before(sequence_nr, node->seq_out[port->type])) return; node->time_in[port->type] = jiffies; node->time_in_stale[port->type] = false; } /* 'skb' is a HSR Ethernet frame (with a HSR tag inserted), with a valid * ethhdr->h_source address and skb->mac_header set. * * Return: * 1 if frame can be shown to have been sent recently on this interface, * 0 otherwise, or * negative error code on error */ int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame) { struct hsr_node *node = frame->node_src; u16 sequence_nr = frame->sequence_nr; spin_lock_bh(&node->seq_out_lock); if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) && time_is_after_jiffies(node->time_out[port->type] + msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) { spin_unlock_bh(&node->seq_out_lock); return 1; } node->time_out[port->type] = jiffies; node->seq_out[port->type] = sequence_nr; spin_unlock_bh(&node->seq_out_lock); return 0; } /* Adaptation of the PRP duplicate discard algorithm described in wireshark * wiki (https://wiki.wireshark.org/PRP) * * A drop window is maintained for both LANs with start sequence set to the * first sequence accepted on the LAN that has not been seen on the other LAN, * and expected sequence set to the latest received sequence number plus one. * * When a frame is received on either LAN it is compared against the received * frames on the other LAN. If it is outside the drop window of the other LAN * the frame is accepted and the drop window is updated. * The drop window for the other LAN is reset. * * 'port' is the outgoing interface * 'frame' is the frame to be sent * * Return: * 1 if frame can be shown to have been sent recently on this interface, * 0 otherwise */ int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame) { enum hsr_port_type other_port; enum hsr_port_type rcv_port; struct hsr_node *node; u16 sequence_diff; u16 sequence_exp; u16 sequence_nr; /* out-going frames are always in order * and can be checked the same way as for HSR */ if (frame->port_rcv->type == HSR_PT_MASTER) return hsr_register_frame_out(port, frame); /* for PRP we should only forward frames from the slave ports * to the master port */ if (port->type != HSR_PT_MASTER) return 1; node = frame->node_src; sequence_nr = frame->sequence_nr; sequence_exp = sequence_nr + 1; rcv_port = frame->port_rcv->type; other_port = rcv_port == HSR_PT_SLAVE_A ? HSR_PT_SLAVE_B : HSR_PT_SLAVE_A; spin_lock_bh(&node->seq_out_lock); if (time_is_before_jiffies(node->time_out[port->type] + msecs_to_jiffies(HSR_ENTRY_FORGET_TIME)) || (node->seq_start[rcv_port] == node->seq_expected[rcv_port] && node->seq_start[other_port] == node->seq_expected[other_port])) { /* the node hasn't been sending for a while * or both drop windows are empty, forward the frame */ node->seq_start[rcv_port] = sequence_nr; } else if (seq_nr_before(sequence_nr, node->seq_expected[other_port]) && seq_nr_before_or_eq(node->seq_start[other_port], sequence_nr)) { /* drop the frame, update the drop window for the other port * and reset our drop window */ node->seq_start[other_port] = sequence_exp; node->seq_expected[rcv_port] = sequence_exp; node->seq_start[rcv_port] = node->seq_expected[rcv_port]; spin_unlock_bh(&node->seq_out_lock); return 1; } /* update the drop window for the port where this frame was received * and clear the drop window for the other port */ node->seq_start[other_port] = node->seq_expected[other_port]; node->seq_expected[rcv_port] = sequence_exp; sequence_diff = sequence_exp - node->seq_start[rcv_port]; if (sequence_diff > PRP_DROP_WINDOW_LEN) node->seq_start[rcv_port] = sequence_exp - PRP_DROP_WINDOW_LEN; node->time_out[port->type] = jiffies; node->seq_out[port->type] = sequence_nr; spin_unlock_bh(&node->seq_out_lock); return 0; } #if IS_MODULE(CONFIG_PRP_DUP_DISCARD_KUNIT_TEST) EXPORT_SYMBOL(prp_register_frame_out); #endif static struct hsr_port *get_late_port(struct hsr_priv *hsr, struct hsr_node *node) { if (node->time_in_stale[HSR_PT_SLAVE_A]) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (node->time_in_stale[HSR_PT_SLAVE_B]) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); if (time_after(node->time_in[HSR_PT_SLAVE_B], node->time_in[HSR_PT_SLAVE_A] + msecs_to_jiffies(MAX_SLAVE_DIFF))) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (time_after(node->time_in[HSR_PT_SLAVE_A], node->time_in[HSR_PT_SLAVE_B] + msecs_to_jiffies(MAX_SLAVE_DIFF))) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); return NULL; } /* Remove stale sequence_nr records. Called by timer every * HSR_LIFE_CHECK_INTERVAL (two seconds or so). */ void hsr_prune_nodes(struct timer_list *t) { struct hsr_priv *hsr = timer_container_of(hsr, t, prune_timer); struct hsr_node *node; struct hsr_node *tmp; struct hsr_port *port; unsigned long timestamp; unsigned long time_a, time_b; spin_lock_bh(&hsr->list_lock); list_for_each_entry_safe(node, tmp, &hsr->node_db, mac_list) { /* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A] * nor time_in[HSR_PT_SLAVE_B], will ever be updated for * the master port. Thus the master node will be repeatedly * pruned leading to packet loss. */ if (hsr_addr_is_self(hsr, node->macaddress_A)) continue; /* Shorthand */ time_a = node->time_in[HSR_PT_SLAVE_A]; time_b = node->time_in[HSR_PT_SLAVE_B]; /* Check for timestamps old enough to risk wrap-around */ if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET / 2)) node->time_in_stale[HSR_PT_SLAVE_A] = true; if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET / 2)) node->time_in_stale[HSR_PT_SLAVE_B] = true; /* Get age of newest frame from node. * At least one time_in is OK here; nodes get pruned long * before both time_ins can get stale */ timestamp = time_a; if (node->time_in_stale[HSR_PT_SLAVE_A] || (!node->time_in_stale[HSR_PT_SLAVE_B] && time_after(time_b, time_a))) timestamp = time_b; /* Warn of ring error only as long as we get frames at all */ if (time_is_after_jiffies(timestamp + msecs_to_jiffies(1.5 * MAX_SLAVE_DIFF))) { rcu_read_lock(); port = get_late_port(hsr, node); if (port) hsr_nl_ringerror(hsr, node->macaddress_A, port); rcu_read_unlock(); } /* Prune old entries */ if (time_is_before_jiffies(timestamp + msecs_to_jiffies(HSR_NODE_FORGET_TIME))) { hsr_nl_nodedown(hsr, node->macaddress_A); if (!node->removed) { list_del_rcu(&node->mac_list); node->removed = true; /* Note that we need to free this entry later: */ kfree_rcu(node, rcu_head); } } } spin_unlock_bh(&hsr->list_lock); /* Restart timer */ mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); } void hsr_prune_proxy_nodes(struct timer_list *t) { struct hsr_priv *hsr = timer_container_of(hsr, t, prune_proxy_timer); unsigned long timestamp; struct hsr_node *node; struct hsr_node *tmp; spin_lock_bh(&hsr->list_lock); list_for_each_entry_safe(node, tmp, &hsr->proxy_node_db, mac_list) { /* Don't prune RedBox node. */ if (hsr_addr_is_redbox(hsr, node->macaddress_A)) continue; timestamp = node->time_in[HSR_PT_INTERLINK]; /* Prune old entries */ if (time_is_before_jiffies(timestamp + msecs_to_jiffies(HSR_PROXY_NODE_FORGET_TIME))) { hsr_nl_nodedown(hsr, node->macaddress_A); if (!node->removed) { list_del_rcu(&node->mac_list); node->removed = true; /* Note that we need to free this entry later: */ kfree_rcu(node, rcu_head); } } } spin_unlock_bh(&hsr->list_lock); /* Restart timer */ mod_timer(&hsr->prune_proxy_timer, jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD)); } void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, unsigned char addr[ETH_ALEN]) { struct hsr_node *node; if (!_pos) { node = list_first_or_null_rcu(&hsr->node_db, struct hsr_node, mac_list); if (node) ether_addr_copy(addr, node->macaddress_A); return node; } node = _pos; list_for_each_entry_continue_rcu(node, &hsr->node_db, mac_list) { ether_addr_copy(addr, node->macaddress_A); return node; } return NULL; } int hsr_get_node_data(struct hsr_priv *hsr, const unsigned char *addr, unsigned char addr_b[ETH_ALEN], unsigned int *addr_b_ifindex, int *if1_age, u16 *if1_seq, int *if2_age, u16 *if2_seq) { struct hsr_node *node; struct hsr_port *port; unsigned long tdiff; node = find_node_by_addr_A(&hsr->node_db, addr); if (!node) return -ENOENT; ether_addr_copy(addr_b, node->macaddress_B); tdiff = jiffies - node->time_in[HSR_PT_SLAVE_A]; if (node->time_in_stale[HSR_PT_SLAVE_A]) *if1_age = INT_MAX; #if HZ <= MSEC_PER_SEC else if (tdiff > msecs_to_jiffies(INT_MAX)) *if1_age = INT_MAX; #endif else *if1_age = jiffies_to_msecs(tdiff); tdiff = jiffies - node->time_in[HSR_PT_SLAVE_B]; if (node->time_in_stale[HSR_PT_SLAVE_B]) *if2_age = INT_MAX; #if HZ <= MSEC_PER_SEC else if (tdiff > msecs_to_jiffies(INT_MAX)) *if2_age = INT_MAX; #endif else *if2_age = jiffies_to_msecs(tdiff); /* Present sequence numbers as if they were incoming on interface */ *if1_seq = node->seq_out[HSR_PT_SLAVE_B]; *if2_seq = node->seq_out[HSR_PT_SLAVE_A]; if (node->addr_B_port != HSR_PT_NONE) { port = hsr_port_get_hsr(hsr, node->addr_B_port); *addr_b_ifindex = port->dev->ifindex; } else { *addr_b_ifindex = -1; } return 0; }
13 2 2 2 1 1 1 1 1 1 1 1 2 2 2 2 2 7 6 7 6 7 1 6 1 1 5 4 4 6 2 2 2 2 2 2 2 2 2 2 2 1 1 1 3 3 3 3 1 3 5 5 5 5 5 5 5 5 5 5 1 1 235 239 2 234 4 239 238 234 4 1 4 1 4 10 10 9 1 9 8 5 3 2 1 1 1 1 1 1 1 1 1 1 8 11 3 3 3 3 11 219 218 30 219 219 219 219 235 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 // SPDX-License-Identifier: GPL-2.0-only /* * HID raw devices, giving access to raw HID events. * * In comparison to hiddev, this device does not process the * hid events at all (no parsing, no lookups). This lets applications * to work on raw hid events as they want to, and avoids a need to * use a transport-specific userspace libhid/libusb libraries. * * Copyright (c) 2007-2014 Jiri Kosina */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/fs.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/cdev.h> #include <linux/poll.h> #include <linux/device.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/hid.h> #include <linux/mutex.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/hidraw.h> static int hidraw_major; static struct cdev hidraw_cdev; static const struct class hidraw_class = { .name = "hidraw", }; static struct hidraw *hidraw_table[HIDRAW_MAX_DEVICES]; static DECLARE_RWSEM(minors_rwsem); static inline bool hidraw_is_revoked(struct hidraw_list *list) { return list->revoked; } static ssize_t hidraw_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct hidraw_list *list = file->private_data; int ret = 0, len; DECLARE_WAITQUEUE(wait, current); if (hidraw_is_revoked(list)) return -ENODEV; mutex_lock(&list->read_mutex); while (ret == 0) { if (list->head == list->tail) { add_wait_queue(&list->hidraw->wait, &wait); set_current_state(TASK_INTERRUPTIBLE); while (list->head == list->tail) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } if (!list->hidraw->exist) { ret = -EIO; break; } if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; break; } /* allow O_NONBLOCK to work well from other threads */ mutex_unlock(&list->read_mutex); schedule(); mutex_lock(&list->read_mutex); set_current_state(TASK_INTERRUPTIBLE); } set_current_state(TASK_RUNNING); remove_wait_queue(&list->hidraw->wait, &wait); } if (ret) goto out; len = list->buffer[list->tail].len > count ? count : list->buffer[list->tail].len; if (list->buffer[list->tail].value) { if (copy_to_user(buffer, list->buffer[list->tail].value, len)) { ret = -EFAULT; goto out; } ret = len; } kfree(list->buffer[list->tail].value); list->buffer[list->tail].value = NULL; list->tail = (list->tail + 1) & (HIDRAW_BUFFER_SIZE - 1); } out: mutex_unlock(&list->read_mutex); return ret; } /* * The first byte of the report buffer is expected to be a report number. */ static ssize_t hidraw_send_report(struct file *file, const char __user *buffer, size_t count, unsigned char report_type) { unsigned int minor = iminor(file_inode(file)); struct hid_device *dev; __u8 *buf; int ret = 0; lockdep_assert_held(&minors_rwsem); if (!hidraw_table[minor] || !hidraw_table[minor]->exist) { ret = -ENODEV; goto out; } dev = hidraw_table[minor]->hid; if (count > HID_MAX_BUFFER_SIZE) { hid_warn(dev, "pid %d passed too large report\n", task_pid_nr(current)); ret = -EINVAL; goto out; } if (count < 2) { hid_warn(dev, "pid %d passed too short report\n", task_pid_nr(current)); ret = -EINVAL; goto out; } buf = memdup_user(buffer, count); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out; } if ((report_type == HID_OUTPUT_REPORT) && !(dev->quirks & HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP)) { ret = __hid_hw_output_report(dev, buf, count, (u64)(long)file, false); /* * compatibility with old implementation of USB-HID and I2C-HID: * if the device does not support receiving output reports, * on an interrupt endpoint, fallback to SET_REPORT HID command. */ if (ret != -ENOSYS) goto out_free; } ret = __hid_hw_raw_request(dev, buf[0], buf, count, report_type, HID_REQ_SET_REPORT, (u64)(long)file, false); out_free: kfree(buf); out: return ret; } static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct hidraw_list *list = file->private_data; ssize_t ret; down_read(&minors_rwsem); if (hidraw_is_revoked(list)) ret = -ENODEV; else ret = hidraw_send_report(file, buffer, count, HID_OUTPUT_REPORT); up_read(&minors_rwsem); return ret; } /* * This function performs a Get_Report transfer over the control endpoint * per section 7.2.1 of the HID specification, version 1.1. The first byte * of buffer is the report number to request, or 0x0 if the device does not * use numbered reports. The report_type parameter can be HID_FEATURE_REPORT * or HID_INPUT_REPORT. */ static ssize_t hidraw_get_report(struct file *file, char __user *buffer, size_t count, unsigned char report_type) { unsigned int minor = iminor(file_inode(file)); struct hid_device *dev; __u8 *buf; int ret = 0, len; unsigned char report_number; lockdep_assert_held(&minors_rwsem); if (!hidraw_table[minor] || !hidraw_table[minor]->exist) { ret = -ENODEV; goto out; } dev = hidraw_table[minor]->hid; if (!dev->ll_driver->raw_request) { ret = -ENODEV; goto out; } if (count > HID_MAX_BUFFER_SIZE) { hid_warn(dev, "pid %d passed too large report\n", task_pid_nr(current)); ret = -EINVAL; goto out; } if (count < 2) { hid_warn(dev, "pid %d passed too short report\n", task_pid_nr(current)); ret = -EINVAL; goto out; } buf = kmalloc(count, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto out; } /* * Read the first byte from the user. This is the report number, * which is passed to hid_hw_raw_request(). */ if (copy_from_user(&report_number, buffer, 1)) { ret = -EFAULT; goto out_free; } ret = __hid_hw_raw_request(dev, report_number, buf, count, report_type, HID_REQ_GET_REPORT, (u64)(long)file, false); if (ret < 0) goto out_free; len = (ret < count) ? ret : count; if (copy_to_user(buffer, buf, len)) { ret = -EFAULT; goto out_free; } ret = len; out_free: kfree(buf); out: return ret; } static __poll_t hidraw_poll(struct file *file, poll_table *wait) { struct hidraw_list *list = file->private_data; __poll_t mask = EPOLLOUT | EPOLLWRNORM; /* hidraw is always writable */ poll_wait(file, &list->hidraw->wait, wait); if (list->head != list->tail) mask |= EPOLLIN | EPOLLRDNORM; if (!list->hidraw->exist || hidraw_is_revoked(list)) mask |= EPOLLERR | EPOLLHUP; return mask; } static int hidraw_open(struct inode *inode, struct file *file) { unsigned int minor = iminor(inode); struct hidraw *dev; struct hidraw_list *list; unsigned long flags; int err = 0; if (!(list = kzalloc(sizeof(struct hidraw_list), GFP_KERNEL))) { err = -ENOMEM; goto out; } /* * Technically not writing to the hidraw_table but a write lock is * required to protect the device refcount. This is symmetrical to * hidraw_release(). */ down_write(&minors_rwsem); if (!hidraw_table[minor] || !hidraw_table[minor]->exist) { err = -ENODEV; goto out_unlock; } dev = hidraw_table[minor]; if (!dev->open++) { err = hid_hw_power(dev->hid, PM_HINT_FULLON); if (err < 0) { dev->open--; goto out_unlock; } err = hid_hw_open(dev->hid); if (err < 0) { hid_hw_power(dev->hid, PM_HINT_NORMAL); dev->open--; goto out_unlock; } } list->hidraw = hidraw_table[minor]; mutex_init(&list->read_mutex); spin_lock_irqsave(&hidraw_table[minor]->list_lock, flags); list_add_tail(&list->node, &hidraw_table[minor]->list); spin_unlock_irqrestore(&hidraw_table[minor]->list_lock, flags); file->private_data = list; out_unlock: up_write(&minors_rwsem); out: if (err < 0) kfree(list); return err; } static int hidraw_fasync(int fd, struct file *file, int on) { struct hidraw_list *list = file->private_data; if (hidraw_is_revoked(list)) return -ENODEV; return fasync_helper(fd, file, on, &list->fasync); } static void drop_ref(struct hidraw *hidraw, int exists_bit) { if (exists_bit) { hidraw->exist = 0; if (hidraw->open) { hid_hw_close(hidraw->hid); wake_up_interruptible(&hidraw->wait); } device_destroy(&hidraw_class, MKDEV(hidraw_major, hidraw->minor)); } else { --hidraw->open; } if (!hidraw->open) { if (!hidraw->exist) { hidraw_table[hidraw->minor] = NULL; kfree(hidraw); } else { /* close device for last reader */ hid_hw_close(hidraw->hid); hid_hw_power(hidraw->hid, PM_HINT_NORMAL); } } } static int hidraw_release(struct inode * inode, struct file * file) { unsigned int minor = iminor(inode); struct hidraw_list *list = file->private_data; unsigned long flags; down_write(&minors_rwsem); spin_lock_irqsave(&hidraw_table[minor]->list_lock, flags); while (list->tail != list->head) { kfree(list->buffer[list->tail].value); list->buffer[list->tail].value = NULL; list->tail = (list->tail + 1) & (HIDRAW_BUFFER_SIZE - 1); } list_del(&list->node); spin_unlock_irqrestore(&hidraw_table[minor]->list_lock, flags); kfree(list); drop_ref(hidraw_table[minor], 0); up_write(&minors_rwsem); return 0; } static int hidraw_revoke(struct hidraw_list *list) { list->revoked = true; return 0; } static long hidraw_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); unsigned int minor = iminor(inode); long ret = 0; struct hidraw *dev; struct hidraw_list *list = file->private_data; void __user *user_arg = (void __user*) arg; down_read(&minors_rwsem); dev = hidraw_table[minor]; if (!dev || !dev->exist || hidraw_is_revoked(list)) { ret = -ENODEV; goto out; } switch (cmd) { case HIDIOCGRDESCSIZE: if (put_user(dev->hid->rsize, (int __user *)arg)) ret = -EFAULT; break; case HIDIOCGRDESC: { __u32 len; if (get_user(len, (int __user *)arg)) ret = -EFAULT; else if (len > HID_MAX_DESCRIPTOR_SIZE - 1) ret = -EINVAL; else if (copy_to_user(user_arg + offsetof( struct hidraw_report_descriptor, value[0]), dev->hid->rdesc, min(dev->hid->rsize, len))) ret = -EFAULT; break; } case HIDIOCGRAWINFO: { struct hidraw_devinfo dinfo; dinfo.bustype = dev->hid->bus; dinfo.vendor = dev->hid->vendor; dinfo.product = dev->hid->product; if (copy_to_user(user_arg, &dinfo, sizeof(dinfo))) ret = -EFAULT; break; } case HIDIOCREVOKE: { if (user_arg) ret = -EINVAL; else ret = hidraw_revoke(list); break; } default: { struct hid_device *hid = dev->hid; if (_IOC_TYPE(cmd) != 'H') { ret = -EINVAL; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSFEATURE(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_send_report(file, user_arg, len, HID_FEATURE_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGFEATURE(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_get_report(file, user_arg, len, HID_FEATURE_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSINPUT(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_send_report(file, user_arg, len, HID_INPUT_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGINPUT(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_get_report(file, user_arg, len, HID_INPUT_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSOUTPUT(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_send_report(file, user_arg, len, HID_OUTPUT_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGOUTPUT(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_get_report(file, user_arg, len, HID_OUTPUT_REPORT); break; } /* Begin Read-only ioctls. */ if (_IOC_DIR(cmd) != _IOC_READ) { ret = -EINVAL; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWNAME(0))) { int len = strlen(hid->name) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); ret = copy_to_user(user_arg, hid->name, len) ? -EFAULT : len; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWPHYS(0))) { int len = strlen(hid->phys) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); ret = copy_to_user(user_arg, hid->phys, len) ? -EFAULT : len; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWUNIQ(0))) { int len = strlen(hid->uniq) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); ret = copy_to_user(user_arg, hid->uniq, len) ? -EFAULT : len; break; } } ret = -ENOTTY; } out: up_read(&minors_rwsem); return ret; } static const struct file_operations hidraw_ops = { .owner = THIS_MODULE, .read = hidraw_read, .write = hidraw_write, .poll = hidraw_poll, .open = hidraw_open, .release = hidraw_release, .unlocked_ioctl = hidraw_ioctl, .fasync = hidraw_fasync, .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; int hidraw_report_event(struct hid_device *hid, u8 *data, int len) { struct hidraw *dev = hid->hidraw; struct hidraw_list *list; int ret = 0; unsigned long flags; spin_lock_irqsave(&dev->list_lock, flags); list_for_each_entry(list, &dev->list, node) { int new_head = (list->head + 1) & (HIDRAW_BUFFER_SIZE - 1); if (hidraw_is_revoked(list) || new_head == list->tail) continue; if (!(list->buffer[list->head].value = kmemdup(data, len, GFP_ATOMIC))) { ret = -ENOMEM; break; } list->buffer[list->head].len = len; list->head = new_head; kill_fasync(&list->fasync, SIGIO, POLL_IN); } spin_unlock_irqrestore(&dev->list_lock, flags); wake_up_interruptible(&dev->wait); return ret; } EXPORT_SYMBOL_GPL(hidraw_report_event); int hidraw_connect(struct hid_device *hid) { int minor, result; struct hidraw *dev; /* we accept any HID device, all applications */ dev = kzalloc(sizeof(struct hidraw), GFP_KERNEL); if (!dev) return -ENOMEM; result = -EINVAL; down_write(&minors_rwsem); for (minor = 0; minor < HIDRAW_MAX_DEVICES; minor++) { if (hidraw_table[minor]) continue; hidraw_table[minor] = dev; result = 0; break; } if (result) { up_write(&minors_rwsem); kfree(dev); goto out; } dev->dev = device_create(&hidraw_class, &hid->dev, MKDEV(hidraw_major, minor), NULL, "%s%d", "hidraw", minor); if (IS_ERR(dev->dev)) { hidraw_table[minor] = NULL; up_write(&minors_rwsem); result = PTR_ERR(dev->dev); kfree(dev); goto out; } init_waitqueue_head(&dev->wait); spin_lock_init(&dev->list_lock); INIT_LIST_HEAD(&dev->list); dev->hid = hid; dev->minor = minor; dev->exist = 1; hid->hidraw = dev; up_write(&minors_rwsem); out: return result; } EXPORT_SYMBOL_GPL(hidraw_connect); void hidraw_disconnect(struct hid_device *hid) { struct hidraw *hidraw = hid->hidraw; down_write(&minors_rwsem); drop_ref(hidraw, 1); up_write(&minors_rwsem); } EXPORT_SYMBOL_GPL(hidraw_disconnect); int __init hidraw_init(void) { int result; dev_t dev_id; result = alloc_chrdev_region(&dev_id, HIDRAW_FIRST_MINOR, HIDRAW_MAX_DEVICES, "hidraw"); if (result < 0) { pr_warn("can't get major number\n"); goto out; } hidraw_major = MAJOR(dev_id); result = class_register(&hidraw_class); if (result) goto error_cdev; cdev_init(&hidraw_cdev, &hidraw_ops); result = cdev_add(&hidraw_cdev, dev_id, HIDRAW_MAX_DEVICES); if (result < 0) goto error_class; pr_info("raw HID events driver (C) Jiri Kosina\n"); out: return result; error_class: class_unregister(&hidraw_class); error_cdev: unregister_chrdev_region(dev_id, HIDRAW_MAX_DEVICES); goto out; } void hidraw_exit(void) { dev_t dev_id = MKDEV(hidraw_major, 0); cdev_del(&hidraw_cdev); class_unregister(&hidraw_class); unregister_chrdev_region(dev_id, HIDRAW_MAX_DEVICES); }
61 39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 // SPDX-License-Identifier: GPL-2.0-or-later /* Sysfs attributes of bond slaves * * Copyright (c) 2014 Scott Feldman <sfeldma@cumulusnetworks.com> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/bonding.h> struct slave_attribute { struct attribute attr; ssize_t (*show)(struct slave *, char *); }; #define SLAVE_ATTR_RO(_name) \ const struct slave_attribute slave_attr_##_name = __ATTR_RO(_name) static ssize_t state_show(struct slave *slave, char *buf) { switch (bond_slave_state(slave)) { case BOND_STATE_ACTIVE: return sysfs_emit(buf, "active\n"); case BOND_STATE_BACKUP: return sysfs_emit(buf, "backup\n"); default: return sysfs_emit(buf, "UNKNOWN\n"); } } static SLAVE_ATTR_RO(state); static ssize_t mii_status_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%s\n", bond_slave_link_status(slave->link)); } static SLAVE_ATTR_RO(mii_status); static ssize_t link_failure_count_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%d\n", slave->link_failure_count); } static SLAVE_ATTR_RO(link_failure_count); static ssize_t perm_hwaddr_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%*phC\n", slave->dev->addr_len, slave->perm_hwaddr); } static SLAVE_ATTR_RO(perm_hwaddr); static ssize_t queue_id_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%d\n", READ_ONCE(slave->queue_id)); } static SLAVE_ATTR_RO(queue_id); static ssize_t ad_aggregator_id_show(struct slave *slave, char *buf) { const struct aggregator *agg; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { agg = SLAVE_AD_INFO(slave)->port.aggregator; if (agg) return sysfs_emit(buf, "%d\n", agg->aggregator_identifier); } return sysfs_emit(buf, "N/A\n"); } static SLAVE_ATTR_RO(ad_aggregator_id); static ssize_t ad_actor_oper_port_state_show(struct slave *slave, char *buf) { const struct port *ad_port; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { ad_port = &SLAVE_AD_INFO(slave)->port; if (ad_port->aggregator) return sysfs_emit(buf, "%u\n", ad_port->actor_oper_port_state); } return sysfs_emit(buf, "N/A\n"); } static SLAVE_ATTR_RO(ad_actor_oper_port_state); static ssize_t ad_partner_oper_port_state_show(struct slave *slave, char *buf) { const struct port *ad_port; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { ad_port = &SLAVE_AD_INFO(slave)->port; if (ad_port->aggregator) return sysfs_emit(buf, "%u\n", ad_port->partner_oper.port_state); } return sysfs_emit(buf, "N/A\n"); } static SLAVE_ATTR_RO(ad_partner_oper_port_state); static const struct attribute *slave_attrs[] = { &slave_attr_state.attr, &slave_attr_mii_status.attr, &slave_attr_link_failure_count.attr, &slave_attr_perm_hwaddr.attr, &slave_attr_queue_id.attr, &slave_attr_ad_aggregator_id.attr, &slave_attr_ad_actor_oper_port_state.attr, &slave_attr_ad_partner_oper_port_state.attr, NULL }; #define to_slave_attr(_at) container_of(_at, struct slave_attribute, attr) static ssize_t slave_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct slave_attribute *slave_attr = to_slave_attr(attr); struct slave *slave = to_slave(kobj); return slave_attr->show(slave, buf); } const struct sysfs_ops slave_sysfs_ops = { .show = slave_show, }; int bond_sysfs_slave_add(struct slave *slave) { return sysfs_create_files(&slave->kobj, slave_attrs); } void bond_sysfs_slave_del(struct slave *slave) { sysfs_remove_files(&slave->kobj, slave_attrs); }
64 2510 2503 2510 2516 2265 2258 712 711 360 360 883 884 62 62 909 910 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 // SPDX-License-Identifier: GPL-2.0 /* * Out-of-line refcount functions. */ #include <linux/mutex.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/bug.h> #define REFCOUNT_WARN(str) WARN_ONCE(1, "refcount_t: " str ".\n") void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) { refcount_set(r, REFCOUNT_SATURATED); switch (t) { case REFCOUNT_ADD_NOT_ZERO_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_UAF: REFCOUNT_WARN("addition on 0; use-after-free"); break; case REFCOUNT_SUB_UAF: REFCOUNT_WARN("underflow; use-after-free"); break; case REFCOUNT_DEC_LEAK: REFCOUNT_WARN("decrement hit 0; leaking memory"); break; default: REFCOUNT_WARN("unknown saturation event!?"); } } EXPORT_SYMBOL(refcount_warn_saturate); /** * refcount_dec_if_one - decrement a refcount if it is 1 * @r: the refcount * * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the * success thereof. * * Like all decrement operations, it provides release memory order and provides * a control dependency. * * It can be used like a try-delete operator; this explicit case is provided * and not cmpxchg in generic, because that would allow implementing unsafe * operations. * * Return: true if the resulting refcount is 0, false otherwise */ bool refcount_dec_if_one(refcount_t *r) { int val = 1; return atomic_try_cmpxchg_release(&r->refs, &val, 0); } EXPORT_SYMBOL(refcount_dec_if_one); /** * refcount_dec_not_one - decrement a refcount if it is not 1 * @r: the refcount * * No atomic_t counterpart, it decrements unless the value is 1, in which case * it will return false. * * Was often done like: atomic_add_unless(&var, -1, 1) * * Return: true if the decrement operation was successful, false otherwise */ bool refcount_dec_not_one(refcount_t *r) { unsigned int new, val = atomic_read(&r->refs); do { if (unlikely(val == REFCOUNT_SATURATED)) return true; if (val == 1) return false; new = val - 1; if (new > val) { WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n"); return true; } } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); return true; } EXPORT_SYMBOL(refcount_dec_not_one); /** * refcount_dec_and_mutex_lock - return holding mutex if able to decrement * refcount to 0 * @r: the refcount * @lock: the mutex to be locked * * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail * to decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. * * Return: true and hold mutex if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) { if (refcount_dec_not_one(r)) return false; mutex_lock(lock); if (!refcount_dec_and_test(r)) { mutex_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_mutex_lock); /** * refcount_dec_and_lock - return holding spinlock if able to decrement * refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) { if (refcount_dec_not_one(r)) return false; spin_lock(lock); if (!refcount_dec_and_test(r)) { spin_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock); /** * refcount_dec_and_lock_irqsave - return holding spinlock with disabled * interrupts if able to decrement refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * @flags: saved IRQ-flags if the is acquired * * Same as refcount_dec_and_lock() above except that the spinlock is acquired * with disabled interrupts. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags) { if (refcount_dec_not_one(r)) return false; spin_lock_irqsave(lock, *flags); if (!refcount_dec_and_test(r)) { spin_unlock_irqrestore(lock, *flags); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock_irqsave);
13 13 13 13 13 13 13 13 13 13 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 13 13 13 13 10 10 10 10 13 13 13 13 9 9 9 9 13 10 10 10 6 6 6 6 6 6 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 4 4 4 4 10 10 10 10 6 6 6 6 6 6 6 13 6 6 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) International Business Machines Corp., 2006 * Copyright (c) Nokia Corporation, 2006, 2007 * * Author: Artem Bityutskiy (Битюцкий Артём) */ /* * UBI input/output sub-system. * * This sub-system provides a uniform way to work with all kinds of the * underlying MTD devices. It also implements handy functions for reading and * writing UBI headers. * * We are trying to have a paranoid mindset and not to trust to what we read * from the flash media in order to be more secure and robust. So this * sub-system validates every single header it reads from the flash media. * * Some words about how the eraseblock headers are stored. * * The erase counter header is always stored at offset zero. By default, the * VID header is stored after the EC header at the closest aligned offset * (i.e. aligned to the minimum I/O unit size). Data starts next to the VID * header at the closest aligned offset. But this default layout may be * changed. For example, for different reasons (e.g., optimization) UBI may be * asked to put the VID header at further offset, and even at an unaligned * offset. Of course, if the offset of the VID header is unaligned, UBI adds * proper padding in front of it. Data offset may also be changed but it has to * be aligned. * * About minimal I/O units. In general, UBI assumes flash device model where * there is only one minimal I/O unit size. E.g., in case of NOR flash it is 1, * in case of NAND flash it is a NAND page, etc. This is reported by MTD in the * @ubi->mtd->writesize field. But as an exception, UBI admits use of another * (smaller) minimal I/O unit size for EC and VID headers to make it possible * to do different optimizations. * * This is extremely useful in case of NAND flashes which admit of several * write operations to one NAND page. In this case UBI can fit EC and VID * headers at one NAND page. Thus, UBI may use "sub-page" size as the minimal * I/O unit for the headers (the @ubi->hdrs_min_io_size field). But it still * reports NAND page size (@ubi->min_io_size) as a minimal I/O unit for the UBI * users. * * Example: some Samsung NANDs with 2KiB pages allow 4x 512-byte writes, so * although the minimal I/O unit is 2K, UBI uses 512 bytes for EC and VID * headers. * * Q: why not just to treat sub-page as a minimal I/O unit of this flash * device, e.g., make @ubi->min_io_size = 512 in the example above? * * A: because when writing a sub-page, MTD still writes a full 2K page but the * bytes which are not relevant to the sub-page are 0xFF. So, basically, * writing 4x512 sub-pages is 4 times slower than writing one 2KiB NAND page. * Thus, we prefer to use sub-pages only for EC and VID headers. * * As it was noted above, the VID header may start at a non-aligned offset. * For example, in case of a 2KiB page NAND flash with a 512 bytes sub-page, * the VID header may reside at offset 1984 which is the last 64 bytes of the * last sub-page (EC header is always at offset zero). This causes some * difficulties when reading and writing VID headers. * * Suppose we have a 64-byte buffer and we read a VID header at it. We change * the data and want to write this VID header out. As we can only write in * 512-byte chunks, we have to allocate one more buffer and copy our VID header * to offset 448 of this buffer. * * The I/O sub-system does the following trick in order to avoid this extra * copy. It always allocates a @ubi->vid_hdr_alsize bytes buffer for the VID * header and returns a pointer to offset @ubi->vid_hdr_shift of this buffer. * When the VID header is being written out, it shifts the VID header pointer * back and writes the whole sub-page. */ #include <linux/crc32.h> #include <linux/err.h> #include <linux/slab.h> #include "ubi.h" static int self_check_not_bad(const struct ubi_device *ubi, int pnum); static int self_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum); static int self_check_ec_hdr(const struct ubi_device *ubi, int pnum, const struct ubi_ec_hdr *ec_hdr); static int self_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum); static int self_check_vid_hdr(const struct ubi_device *ubi, int pnum, const struct ubi_vid_hdr *vid_hdr); static int self_check_write(struct ubi_device *ubi, const void *buf, int pnum, int offset, int len); /** * ubi_io_read - read data from a physical eraseblock. * @ubi: UBI device description object * @buf: buffer where to store the read data * @pnum: physical eraseblock number to read from * @offset: offset within the physical eraseblock from where to read * @len: how many bytes to read * * This function reads data from offset @offset of physical eraseblock @pnum * and stores the read data in the @buf buffer. The following return codes are * possible: * * o %0 if all the requested data were successfully read; * o %UBI_IO_BITFLIPS if all the requested data were successfully read, but * correctable bit-flips were detected; this is harmless but may indicate * that this eraseblock may become bad soon (but do not have to); * o %-EBADMSG if the MTD subsystem reported about data integrity problems, for * example it can be an ECC error in case of NAND; this most probably means * that the data is corrupted; * o %-EIO if some I/O error occurred; * o other negative error codes in case of other errors. */ int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset, int len) { int err, retries = 0; size_t read; loff_t addr; dbg_io("read %d bytes from PEB %d:%d", len, pnum, offset); ubi_assert(pnum >= 0 && pnum < ubi->peb_count); ubi_assert(offset >= 0 && offset + len <= ubi->peb_size); ubi_assert(len > 0); err = self_check_not_bad(ubi, pnum); if (err) return err; /* * Deliberately corrupt the buffer to improve robustness. Indeed, if we * do not do this, the following may happen: * 1. The buffer contains data from previous operation, e.g., read from * another PEB previously. The data looks like expected, e.g., if we * just do not read anything and return - the caller would not * notice this. E.g., if we are reading a VID header, the buffer may * contain a valid VID header from another PEB. * 2. The driver is buggy and returns us success or -EBADMSG or * -EUCLEAN, but it does not actually put any data to the buffer. * * This may confuse UBI or upper layers - they may think the buffer * contains valid data while in fact it is just old data. This is * especially possible because UBI (and UBIFS) relies on CRC, and * treats data as correct even in case of ECC errors if the CRC is * correct. * * Try to prevent this situation by changing the first byte of the * buffer. */ *((uint8_t *)buf) ^= 0xFF; addr = (loff_t)pnum * ubi->peb_size + offset; retry: err = mtd_read(ubi->mtd, addr, len, &read, buf); if (err) { const char *errstr = mtd_is_eccerr(err) ? " (ECC error)" : ""; if (mtd_is_bitflip(err)) { /* * -EUCLEAN is reported if there was a bit-flip which * was corrected, so this is harmless. * * We do not report about it here unless debugging is * enabled. A corresponding message will be printed * later, when it is has been scrubbed. */ ubi_msg(ubi, "fixable bit-flip detected at PEB %d", pnum); ubi_assert(len == read); return UBI_IO_BITFLIPS; } if (retries++ < UBI_IO_RETRIES) { ubi_warn(ubi, "error %d%s while reading %d bytes from PEB %d:%d, read only %zd bytes, retry", err, errstr, len, pnum, offset, read); yield(); goto retry; } ubi_err(ubi, "error %d%s while reading %d bytes from PEB %d:%d, read %zd bytes", err, errstr, len, pnum, offset, read); dump_stack(); /* * The driver should never return -EBADMSG if it failed to read * all the requested data. But some buggy drivers might do * this, so we change it to -EIO. */ if (read != len && mtd_is_eccerr(err)) { ubi_assert(0); err = -EIO; } } else { ubi_assert(len == read); if (ubi_dbg_is_bitflip(ubi)) { dbg_gen("bit-flip (emulated)"); return UBI_IO_BITFLIPS; } if (ubi_dbg_is_read_failure(ubi, MASK_READ_FAILURE)) { ubi_warn(ubi, "cannot read %d bytes from PEB %d:%d (emulated)", len, pnum, offset); return -EIO; } if (ubi_dbg_is_eccerr(ubi)) { ubi_warn(ubi, "ECC error (emulated) while reading %d bytes from PEB %d:%d, read %zd bytes", len, pnum, offset, read); return -EBADMSG; } } return err; } /** * ubi_io_write - write data to a physical eraseblock. * @ubi: UBI device description object * @buf: buffer with the data to write * @pnum: physical eraseblock number to write to * @offset: offset within the physical eraseblock where to write * @len: how many bytes to write * * This function writes @len bytes of data from buffer @buf to offset @offset * of physical eraseblock @pnum. If all the data were successfully written, * zero is returned. If an error occurred, this function returns a negative * error code. If %-EIO is returned, the physical eraseblock most probably went * bad. * * Note, in case of an error, it is possible that something was still written * to the flash media, but may be some garbage. */ int ubi_io_write(struct ubi_device *ubi, const void *buf, int pnum, int offset, int len) { int err; size_t written; loff_t addr; dbg_io("write %d bytes to PEB %d:%d", len, pnum, offset); ubi_assert(pnum >= 0 && pnum < ubi->peb_count); ubi_assert(offset >= 0 && offset + len <= ubi->peb_size); ubi_assert(offset % ubi->hdrs_min_io_size == 0); ubi_assert(len > 0 && len % ubi->hdrs_min_io_size == 0); if (ubi->ro_mode) { ubi_err(ubi, "read-only mode"); return -EROFS; } err = self_check_not_bad(ubi, pnum); if (err) return err; /* The area we are writing to has to contain all 0xFF bytes */ err = ubi_self_check_all_ff(ubi, pnum, offset, len); if (err) return err; if (offset >= ubi->leb_start) { /* * We write to the data area of the physical eraseblock. Make * sure it has valid EC and VID headers. */ err = self_check_peb_ec_hdr(ubi, pnum); if (err) return err; err = self_check_peb_vid_hdr(ubi, pnum); if (err) return err; } if (ubi_dbg_is_write_failure(ubi)) { ubi_err(ubi, "cannot write %d bytes to PEB %d:%d (emulated)", len, pnum, offset); dump_stack(); return -EIO; } addr = (loff_t)pnum * ubi->peb_size + offset; err = mtd_write(ubi->mtd, addr, len, &written, buf); if (err) { ubi_err(ubi, "error %d while writing %d bytes to PEB %d:%d, written %zd bytes", err, len, pnum, offset, written); dump_stack(); ubi_dump_flash(ubi, pnum, offset, len); } else ubi_assert(written == len); if (!err) { err = self_check_write(ubi, buf, pnum, offset, len); if (err) return err; /* * Since we always write sequentially, the rest of the PEB has * to contain only 0xFF bytes. */ offset += len; len = ubi->peb_size - offset; if (len) err = ubi_self_check_all_ff(ubi, pnum, offset, len); } return err; } /** * do_sync_erase - synchronously erase a physical eraseblock. * @ubi: UBI device description object * @pnum: the physical eraseblock number to erase * * This function synchronously erases physical eraseblock @pnum and returns * zero in case of success and a negative error code in case of failure. If * %-EIO is returned, the physical eraseblock most probably went bad. */ static int do_sync_erase(struct ubi_device *ubi, int pnum) { int err, retries = 0; struct erase_info ei; dbg_io("erase PEB %d", pnum); ubi_assert(pnum >= 0 && pnum < ubi->peb_count); if (ubi->ro_mode) { ubi_err(ubi, "read-only mode"); return -EROFS; } retry: memset(&ei, 0, sizeof(struct erase_info)); ei.addr = (loff_t)pnum * ubi->peb_size; ei.len = ubi->peb_size; err = mtd_erase(ubi->mtd, &ei); if (err) { if (retries++ < UBI_IO_RETRIES) { ubi_warn(ubi, "error %d while erasing PEB %d, retry", err, pnum); yield(); goto retry; } ubi_err(ubi, "cannot erase PEB %d, error %d", pnum, err); dump_stack(); return err; } err = ubi_self_check_all_ff(ubi, pnum, 0, ubi->peb_size); if (err) return err; if (ubi_dbg_is_erase_failure(ubi)) { ubi_err(ubi, "cannot erase PEB %d (emulated)", pnum); return -EIO; } return 0; } /* Patterns to write to a physical eraseblock when torturing it */ static uint8_t patterns[] = {0xa5, 0x5a, 0x0}; /** * torture_peb - test a supposedly bad physical eraseblock. * @ubi: UBI device description object * @pnum: the physical eraseblock number to test * * This function returns %-EIO if the physical eraseblock did not pass the * test, a positive number of erase operations done if the test was * successfully passed, and other negative error codes in case of other errors. */ static int torture_peb(struct ubi_device *ubi, int pnum) { int err, i, patt_count; ubi_msg(ubi, "run torture test for PEB %d", pnum); patt_count = ARRAY_SIZE(patterns); ubi_assert(patt_count > 0); mutex_lock(&ubi->buf_mutex); for (i = 0; i < patt_count; i++) { err = do_sync_erase(ubi, pnum); if (err) goto out; /* Make sure the PEB contains only 0xFF bytes */ err = ubi_io_read(ubi, ubi->peb_buf, pnum, 0, ubi->peb_size); if (err) goto out; err = ubi_check_pattern(ubi->peb_buf, 0xFF, ubi->peb_size); if (err == 0) { ubi_err(ubi, "erased PEB %d, but a non-0xFF byte found", pnum); err = -EIO; goto out; } /* Write a pattern and check it */ memset(ubi->peb_buf, patterns[i], ubi->peb_size); err = ubi_io_write(ubi, ubi->peb_buf, pnum, 0, ubi->peb_size); if (err) goto out; memset(ubi->peb_buf, ~patterns[i], ubi->peb_size); err = ubi_io_read(ubi, ubi->peb_buf, pnum, 0, ubi->peb_size); if (err) goto out; err = ubi_check_pattern(ubi->peb_buf, patterns[i], ubi->peb_size); if (err == 0) { ubi_err(ubi, "pattern %x checking failed for PEB %d", patterns[i], pnum); err = -EIO; goto out; } } err = patt_count; ubi_msg(ubi, "PEB %d passed torture test, do not mark it as bad", pnum); out: mutex_unlock(&ubi->buf_mutex); if (err == UBI_IO_BITFLIPS || mtd_is_eccerr(err)) { /* * If a bit-flip or data integrity error was detected, the test * has not passed because it happened on a freshly erased * physical eraseblock which means something is wrong with it. */ ubi_err(ubi, "read problems on freshly erased PEB %d, must be bad", pnum); err = -EIO; } return err; } /** * nor_erase_prepare - prepare a NOR flash PEB for erasure. * @ubi: UBI device description object * @pnum: physical eraseblock number to prepare * * NOR flash, or at least some of them, have peculiar embedded PEB erasure * algorithm: the PEB is first filled with zeroes, then it is erased. And * filling with zeroes starts from the end of the PEB. This was observed with * Spansion S29GL512N NOR flash. * * This means that in case of a power cut we may end up with intact data at the * beginning of the PEB, and all zeroes at the end of PEB. In other words, the * EC and VID headers are OK, but a large chunk of data at the end of PEB is * zeroed. This makes UBI mistakenly treat this PEB as used and associate it * with an LEB, which leads to subsequent failures (e.g., UBIFS fails). * * This function is called before erasing NOR PEBs and it zeroes out EC and VID * magic numbers in order to invalidate them and prevent the failures. Returns * zero in case of success and a negative error code in case of failure. */ static int nor_erase_prepare(struct ubi_device *ubi, int pnum) { int err; size_t written; loff_t addr; uint32_t data = 0; struct ubi_ec_hdr ec_hdr; struct ubi_vid_io_buf vidb; /* * Note, we cannot generally define VID header buffers on stack, * because of the way we deal with these buffers (see the header * comment in this file). But we know this is a NOR-specific piece of * code, so we can do this. But yes, this is error-prone and we should * (pre-)allocate VID header buffer instead. */ struct ubi_vid_hdr vid_hdr; /* * If VID or EC is valid, we have to corrupt them before erasing. * It is important to first invalidate the EC header, and then the VID * header. Otherwise a power cut may lead to valid EC header and * invalid VID header, in which case UBI will treat this PEB as * corrupted and will try to preserve it, and print scary warnings. */ addr = (loff_t)pnum * ubi->peb_size; err = ubi_io_read_ec_hdr(ubi, pnum, &ec_hdr, 0); if (err != UBI_IO_BAD_HDR_EBADMSG && err != UBI_IO_BAD_HDR && err != UBI_IO_FF){ err = mtd_write(ubi->mtd, addr, 4, &written, (void *)&data); if(err) goto error; } ubi_init_vid_buf(ubi, &vidb, &vid_hdr); ubi_assert(&vid_hdr == ubi_get_vid_hdr(&vidb)); err = ubi_io_read_vid_hdr(ubi, pnum, &vidb, 0); if (err != UBI_IO_BAD_HDR_EBADMSG && err != UBI_IO_BAD_HDR && err != UBI_IO_FF){ addr += ubi->vid_hdr_aloffset; err = mtd_write(ubi->mtd, addr, 4, &written, (void *)&data); if (err) goto error; } return 0; error: /* * The PEB contains a valid VID or EC header, but we cannot invalidate * it. Supposedly the flash media or the driver is screwed up, so * return an error. */ ubi_err(ubi, "cannot invalidate PEB %d, write returned %d", pnum, err); ubi_dump_flash(ubi, pnum, 0, ubi->peb_size); return -EIO; } /** * ubi_io_sync_erase - synchronously erase a physical eraseblock. * @ubi: UBI device description object * @pnum: physical eraseblock number to erase * @torture: if this physical eraseblock has to be tortured * * This function synchronously erases physical eraseblock @pnum. If @torture * flag is not zero, the physical eraseblock is checked by means of writing * different patterns to it and reading them back. If the torturing is enabled, * the physical eraseblock is erased more than once. * * This function returns the number of erasures made in case of success, %-EIO * if the erasure failed or the torturing test failed, and other negative error * codes in case of other errors. Note, %-EIO means that the physical * eraseblock is bad. */ int ubi_io_sync_erase(struct ubi_device *ubi, int pnum, int torture) { int err, ret = 0; ubi_assert(pnum >= 0 && pnum < ubi->peb_count); err = self_check_not_bad(ubi, pnum); if (err != 0) return err; if (ubi->ro_mode) { ubi_err(ubi, "read-only mode"); return -EROFS; } /* * If the flash is ECC-ed then we have to erase the ECC block before we * can write to it. But the write is in preparation to an erase in the * first place. This means we cannot zero out EC and VID before the * erase and we just have to hope the flash starts erasing from the * start of the page. */ if (ubi->nor_flash && ubi->mtd->writesize == 1) { err = nor_erase_prepare(ubi, pnum); if (err) return err; } if (torture) { ret = torture_peb(ubi, pnum); if (ret < 0) return ret; } err = do_sync_erase(ubi, pnum); if (err) return err; return ret + 1; } /** * ubi_io_is_bad - check if a physical eraseblock is bad. * @ubi: UBI device description object * @pnum: the physical eraseblock number to check * * This function returns a positive number if the physical eraseblock is bad, * zero if not, and a negative error code if an error occurred. */ int ubi_io_is_bad(const struct ubi_device *ubi, int pnum) { struct mtd_info *mtd = ubi->mtd; ubi_assert(pnum >= 0 && pnum < ubi->peb_count); if (ubi->bad_allowed) { int ret; ret = mtd_block_isbad(mtd, (loff_t)pnum * ubi->peb_size); if (ret < 0) ubi_err(ubi, "error %d while checking if PEB %d is bad", ret, pnum); else if (ret) dbg_io("PEB %d is bad", pnum); return ret; } return 0; } /** * ubi_io_mark_bad - mark a physical eraseblock as bad. * @ubi: UBI device description object * @pnum: the physical eraseblock number to mark * * This function returns zero in case of success and a negative error code in * case of failure. */ int ubi_io_mark_bad(const struct ubi_device *ubi, int pnum) { int err; struct mtd_info *mtd = ubi->mtd; ubi_assert(pnum >= 0 && pnum < ubi->peb_count); if (ubi->ro_mode) { ubi_err(ubi, "read-only mode"); return -EROFS; } if (!ubi->bad_allowed) return 0; err = mtd_block_markbad(mtd, (loff_t)pnum * ubi->peb_size); if (err) ubi_err(ubi, "cannot mark PEB %d bad, error %d", pnum, err); return err; } /** * validate_ec_hdr - validate an erase counter header. * @ubi: UBI device description object * @ec_hdr: the erase counter header to check * * This function returns zero if the erase counter header is OK, and %1 if * not. */ static int validate_ec_hdr(const struct ubi_device *ubi, const struct ubi_ec_hdr *ec_hdr) { long long ec; int vid_hdr_offset, leb_start; ec = be64_to_cpu(ec_hdr->ec); vid_hdr_offset = be32_to_cpu(ec_hdr->vid_hdr_offset); leb_start = be32_to_cpu(ec_hdr->data_offset); if (ec_hdr->version != UBI_VERSION) { ubi_err(ubi, "node with incompatible UBI version found: this UBI version is %d, image version is %d", UBI_VERSION, (int)ec_hdr->version); goto bad; } if (vid_hdr_offset != ubi->vid_hdr_offset) { ubi_err(ubi, "bad VID header offset %d, expected %d", vid_hdr_offset, ubi->vid_hdr_offset); goto bad; } if (leb_start != ubi->leb_start) { ubi_err(ubi, "bad data offset %d, expected %d", leb_start, ubi->leb_start); goto bad; } if (ec < 0 || ec > UBI_MAX_ERASECOUNTER) { ubi_err(ubi, "bad erase counter %lld", ec); goto bad; } return 0; bad: ubi_err(ubi, "bad EC header"); ubi_dump_ec_hdr(ec_hdr); dump_stack(); return 1; } /** * ubi_io_read_ec_hdr - read and check an erase counter header. * @ubi: UBI device description object * @pnum: physical eraseblock to read from * @ec_hdr: a &struct ubi_ec_hdr object where to store the read erase counter * header * @verbose: be verbose if the header is corrupted or was not found * * This function reads erase counter header from physical eraseblock @pnum and * stores it in @ec_hdr. This function also checks CRC checksum of the read * erase counter header. The following codes may be returned: * * o %0 if the CRC checksum is correct and the header was successfully read; * o %UBI_IO_BITFLIPS if the CRC is correct, but bit-flips were detected * and corrected by the flash driver; this is harmless but may indicate that * this eraseblock may become bad soon (but may be not); * o %UBI_IO_BAD_HDR if the erase counter header is corrupted (a CRC error); * o %UBI_IO_BAD_HDR_EBADMSG is the same as %UBI_IO_BAD_HDR, but there also was * a data integrity error (uncorrectable ECC error in case of NAND); * o %UBI_IO_FF if only 0xFF bytes were read (the PEB is supposedly empty) * o a negative error code in case of failure. */ int ubi_io_read_ec_hdr(struct ubi_device *ubi, int pnum, struct ubi_ec_hdr *ec_hdr, int verbose) { int err, read_err; uint32_t crc, magic, hdr_crc; dbg_io("read EC header from PEB %d", pnum); ubi_assert(pnum >= 0 && pnum < ubi->peb_count); read_err = ubi_io_read(ubi, ec_hdr, pnum, 0, UBI_EC_HDR_SIZE); if (read_err) { if (read_err != UBI_IO_BITFLIPS && !mtd_is_eccerr(read_err)) return read_err; /* * We read all the data, but either a correctable bit-flip * occurred, or MTD reported a data integrity error * (uncorrectable ECC error in case of NAND). The former is * harmless, the later may mean that the read data is * corrupted. But we have a CRC check-sum and we will detect * this. If the EC header is still OK, we just report this as * there was a bit-flip, to force scrubbing. */ } magic = be32_to_cpu(ec_hdr->magic); if (magic != UBI_EC_HDR_MAGIC) { if (mtd_is_eccerr(read_err)) return UBI_IO_BAD_HDR_EBADMSG; /* * The magic field is wrong. Let's check if we have read all * 0xFF. If yes, this physical eraseblock is assumed to be * empty. */ if (ubi_check_pattern(ec_hdr, 0xFF, UBI_EC_HDR_SIZE)) { /* The physical eraseblock is supposedly empty */ if (verbose) ubi_warn(ubi, "no EC header found at PEB %d, only 0xFF bytes", pnum); dbg_bld("no EC header found at PEB %d, only 0xFF bytes", pnum); if (!read_err) return UBI_IO_FF; else return UBI_IO_FF_BITFLIPS; } /* * This is not a valid erase counter header, and these are not * 0xFF bytes. Report that the header is corrupted. */ if (verbose) { ubi_warn(ubi, "bad magic number at PEB %d: %08x instead of %08x", pnum, magic, UBI_EC_HDR_MAGIC); ubi_dump_ec_hdr(ec_hdr); } dbg_bld("bad magic number at PEB %d: %08x instead of %08x", pnum, magic, UBI_EC_HDR_MAGIC); return UBI_IO_BAD_HDR; } crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC); hdr_crc = be32_to_cpu(ec_hdr->hdr_crc); if (hdr_crc != crc) { if (verbose) { ubi_warn(ubi, "bad EC header CRC at PEB %d, calculated %#08x, read %#08x", pnum, crc, hdr_crc); ubi_dump_ec_hdr(ec_hdr); } dbg_bld("bad EC header CRC at PEB %d, calculated %#08x, read %#08x", pnum, crc, hdr_crc); if (!read_err) return UBI_IO_BAD_HDR; else return UBI_IO_BAD_HDR_EBADMSG; } /* And of course validate what has just been read from the media */ err = validate_ec_hdr(ubi, ec_hdr); if (err) { ubi_err(ubi, "validation failed for PEB %d", pnum); return -EINVAL; } /* * If there was %-EBADMSG, but the header CRC is still OK, report about * a bit-flip to force scrubbing on this PEB. */ if (read_err) return UBI_IO_BITFLIPS; if (ubi_dbg_is_read_failure(ubi, MASK_READ_FAILURE_EC)) { ubi_warn(ubi, "cannot read EC header from PEB %d (emulated)", pnum); return -EIO; } if (ubi_dbg_is_ff(ubi, MASK_IO_FF_EC)) { ubi_warn(ubi, "bit-all-ff (emulated)"); return UBI_IO_FF; } if (ubi_dbg_is_ff_bitflips(ubi, MASK_IO_FF_BITFLIPS_EC)) { ubi_warn(ubi, "bit-all-ff with error reported by MTD driver (emulated)"); return UBI_IO_FF_BITFLIPS; } if (ubi_dbg_is_bad_hdr(ubi, MASK_BAD_HDR_EC)) { ubi_warn(ubi, "bad_hdr (emulated)"); return UBI_IO_BAD_HDR; } if (ubi_dbg_is_bad_hdr_ebadmsg(ubi, MASK_BAD_HDR_EBADMSG_EC)) { ubi_warn(ubi, "bad_hdr with ECC error (emulated)"); return UBI_IO_BAD_HDR_EBADMSG; } return 0; } /** * ubi_io_write_ec_hdr - write an erase counter header. * @ubi: UBI device description object * @pnum: physical eraseblock to write to * @ec_hdr: the erase counter header to write * * This function writes erase counter header described by @ec_hdr to physical * eraseblock @pnum. It also fills most fields of @ec_hdr before writing, so * the caller do not have to fill them. Callers must only fill the @ec_hdr->ec * field. * * This function returns zero in case of success and a negative error code in * case of failure. If %-EIO is returned, the physical eraseblock most probably * went bad. */ int ubi_io_write_ec_hdr(struct ubi_device *ubi, int pnum, struct ubi_ec_hdr *ec_hdr) { int err; uint32_t crc; dbg_io("write EC header to PEB %d", pnum); ubi_assert(pnum >= 0 && pnum < ubi->peb_count); ec_hdr->magic = cpu_to_be32(UBI_EC_HDR_MAGIC); ec_hdr->version = UBI_VERSION; ec_hdr->vid_hdr_offset = cpu_to_be32(ubi->vid_hdr_offset); ec_hdr->data_offset = cpu_to_be32(ubi->leb_start); ec_hdr->image_seq = cpu_to_be32(ubi->image_seq); crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC); ec_hdr->hdr_crc = cpu_to_be32(crc); err = self_check_ec_hdr(ubi, pnum, ec_hdr); if (err) return err; if (ubi_dbg_is_power_cut(ubi, MASK_POWER_CUT_EC)) { ubi_warn(ubi, "emulating a power cut when writing EC header"); ubi_ro_mode(ubi); return -EROFS; } err = ubi_io_write(ubi, ec_hdr, pnum, 0, ubi->ec_hdr_alsize); return err; } /** * validate_vid_hdr - validate a volume identifier header. * @ubi: UBI device description object * @vid_hdr: the volume identifier header to check * * This function checks that data stored in the volume identifier header * @vid_hdr. Returns zero if the VID header is OK and %1 if not. */ static int validate_vid_hdr(const struct ubi_device *ubi, const struct ubi_vid_hdr *vid_hdr) { int vol_type = vid_hdr->vol_type; int copy_flag = vid_hdr->copy_flag; int vol_id = be32_to_cpu(vid_hdr->vol_id); int lnum = be32_to_cpu(vid_hdr->lnum); int compat = vid_hdr->compat; int data_size = be32_to_cpu(vid_hdr->data_size); int used_ebs = be32_to_cpu(vid_hdr->used_ebs); int data_pad = be32_to_cpu(vid_hdr->data_pad); int data_crc = be32_to_cpu(vid_hdr->data_crc); int usable_leb_size = ubi->leb_size - data_pad; if (copy_flag != 0 && copy_flag != 1) { ubi_err(ubi, "bad copy_flag"); goto bad; } if (vol_id < 0 || lnum < 0 || data_size < 0 || used_ebs < 0 || data_pad < 0) { ubi_err(ubi, "negative values"); goto bad; } if (vol_id >= UBI_MAX_VOLUMES && vol_id < UBI_INTERNAL_VOL_START) { ubi_err(ubi, "bad vol_id"); goto bad; } if (vol_id < UBI_INTERNAL_VOL_START && compat != 0) { ubi_err(ubi, "bad compat"); goto bad; } if (vol_id >= UBI_INTERNAL_VOL_START && compat != UBI_COMPAT_DELETE && compat != UBI_COMPAT_RO && compat != UBI_COMPAT_PRESERVE && compat != UBI_COMPAT_REJECT) { ubi_err(ubi, "bad compat"); goto bad; } if (vol_type != UBI_VID_DYNAMIC && vol_type != UBI_VID_STATIC) { ubi_err(ubi, "bad vol_type"); goto bad; } if (data_pad >= ubi->leb_size / 2) { ubi_err(ubi, "bad data_pad"); goto bad; } if (data_size > ubi->leb_size) { ubi_err(ubi, "bad data_size"); goto bad; } if (vol_type == UBI_VID_STATIC) { /* * Although from high-level point of view static volumes may * contain zero bytes of data, but no VID headers can contain * zero at these fields, because they empty volumes do not have * mapped logical eraseblocks. */ if (used_ebs == 0) { ubi_err(ubi, "zero used_ebs"); goto bad; } if (data_size == 0) { ubi_err(ubi, "zero data_size"); goto bad; } if (lnum < used_ebs - 1) { if (data_size != usable_leb_size) { ubi_err(ubi, "bad data_size"); goto bad; } } else if (lnum > used_ebs - 1) { ubi_err(ubi, "too high lnum"); goto bad; } } else { if (copy_flag == 0) { if (data_crc != 0) { ubi_err(ubi, "non-zero data CRC"); goto bad; } if (data_size != 0) { ubi_err(ubi, "non-zero data_size"); goto bad; } } else { if (data_size == 0) { ubi_err(ubi, "zero data_size of copy"); goto bad; } } if (used_ebs != 0) { ubi_err(ubi, "bad used_ebs"); goto bad; } } return 0; bad: ubi_err(ubi, "bad VID header"); ubi_dump_vid_hdr(vid_hdr); dump_stack(); return 1; } /** * ubi_io_read_vid_hdr - read and check a volume identifier header. * @ubi: UBI device description object * @pnum: physical eraseblock number to read from * @vidb: the volume identifier buffer to store data in * @verbose: be verbose if the header is corrupted or wasn't found * * This function reads the volume identifier header from physical eraseblock * @pnum and stores it in @vidb. It also checks CRC checksum of the read * volume identifier header. The error codes are the same as in * 'ubi_io_read_ec_hdr()'. * * Note, the implementation of this function is also very similar to * 'ubi_io_read_ec_hdr()', so refer commentaries in 'ubi_io_read_ec_hdr()'. */ int ubi_io_read_vid_hdr(struct ubi_device *ubi, int pnum, struct ubi_vid_io_buf *vidb, int verbose) { int err, read_err; uint32_t crc, magic, hdr_crc; struct ubi_vid_hdr *vid_hdr = ubi_get_vid_hdr(vidb); void *p = vidb->buffer; dbg_io("read VID header from PEB %d", pnum); ubi_assert(pnum >= 0 && pnum < ubi->peb_count); read_err = ubi_io_read(ubi, p, pnum, ubi->vid_hdr_aloffset, ubi->vid_hdr_shift + UBI_VID_HDR_SIZE); if (read_err && read_err != UBI_IO_BITFLIPS && !mtd_is_eccerr(read_err)) return read_err; magic = be32_to_cpu(vid_hdr->magic); if (magic != UBI_VID_HDR_MAGIC) { if (mtd_is_eccerr(read_err)) return UBI_IO_BAD_HDR_EBADMSG; if (ubi_check_pattern(vid_hdr, 0xFF, UBI_VID_HDR_SIZE)) { if (verbose) ubi_warn(ubi, "no VID header found at PEB %d, only 0xFF bytes", pnum); dbg_bld("no VID header found at PEB %d, only 0xFF bytes", pnum); if (!read_err) return UBI_IO_FF; else return UBI_IO_FF_BITFLIPS; } if (verbose) { ubi_warn(ubi, "bad magic number at PEB %d: %08x instead of %08x", pnum, magic, UBI_VID_HDR_MAGIC); ubi_dump_vid_hdr(vid_hdr); } dbg_bld("bad magic number at PEB %d: %08x instead of %08x", pnum, magic, UBI_VID_HDR_MAGIC); return UBI_IO_BAD_HDR; } crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC); hdr_crc = be32_to_cpu(vid_hdr->hdr_crc); if (hdr_crc != crc) { if (verbose) { ubi_warn(ubi, "bad CRC at PEB %d, calculated %#08x, read %#08x", pnum, crc, hdr_crc); ubi_dump_vid_hdr(vid_hdr); } dbg_bld("bad CRC at PEB %d, calculated %#08x, read %#08x", pnum, crc, hdr_crc); if (!read_err) return UBI_IO_BAD_HDR; else return UBI_IO_BAD_HDR_EBADMSG; } err = validate_vid_hdr(ubi, vid_hdr); if (err) { ubi_err(ubi, "validation failed for PEB %d", pnum); return -EINVAL; } if (read_err) return UBI_IO_BITFLIPS; if (ubi_dbg_is_read_failure(ubi, MASK_READ_FAILURE_VID)) { ubi_warn(ubi, "cannot read VID header from PEB %d (emulated)", pnum); return -EIO; } if (ubi_dbg_is_ff(ubi, MASK_IO_FF_VID)) { ubi_warn(ubi, "bit-all-ff (emulated)"); return UBI_IO_FF; } if (ubi_dbg_is_ff_bitflips(ubi, MASK_IO_FF_BITFLIPS_VID)) { ubi_warn(ubi, "bit-all-ff with error reported by MTD driver (emulated)"); return UBI_IO_FF_BITFLIPS; } if (ubi_dbg_is_bad_hdr(ubi, MASK_BAD_HDR_VID)) { ubi_warn(ubi, "bad_hdr (emulated)"); return UBI_IO_BAD_HDR; } if (ubi_dbg_is_bad_hdr_ebadmsg(ubi, MASK_BAD_HDR_EBADMSG_VID)) { ubi_warn(ubi, "bad_hdr with ECC error (emulated)"); return UBI_IO_BAD_HDR_EBADMSG; } return 0; } /** * ubi_io_write_vid_hdr - write a volume identifier header. * @ubi: UBI device description object * @pnum: the physical eraseblock number to write to * @vidb: the volume identifier buffer to write * * This function writes the volume identifier header described by @vid_hdr to * physical eraseblock @pnum. This function automatically fills the * @vidb->hdr->magic and the @vidb->hdr->version fields, as well as calculates * header CRC checksum and stores it at vidb->hdr->hdr_crc. * * This function returns zero in case of success and a negative error code in * case of failure. If %-EIO is returned, the physical eraseblock probably went * bad. */ int ubi_io_write_vid_hdr(struct ubi_device *ubi, int pnum, struct ubi_vid_io_buf *vidb) { struct ubi_vid_hdr *vid_hdr = ubi_get_vid_hdr(vidb); int err; uint32_t crc; void *p = vidb->buffer; dbg_io("write VID header to PEB %d", pnum); ubi_assert(pnum >= 0 && pnum < ubi->peb_count); err = self_check_peb_ec_hdr(ubi, pnum); if (err) return err; vid_hdr->magic = cpu_to_be32(UBI_VID_HDR_MAGIC); vid_hdr->version = UBI_VERSION; crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC); vid_hdr->hdr_crc = cpu_to_be32(crc); err = self_check_vid_hdr(ubi, pnum, vid_hdr); if (err) return err; if (ubi_dbg_is_power_cut(ubi, MASK_POWER_CUT_VID)) { ubi_warn(ubi, "emulating a power cut when writing VID header"); ubi_ro_mode(ubi); return -EROFS; } err = ubi_io_write(ubi, p, pnum, ubi->vid_hdr_aloffset, ubi->vid_hdr_alsize); return err; } /** * self_check_not_bad - ensure that a physical eraseblock is not bad. * @ubi: UBI device description object * @pnum: physical eraseblock number to check * * This function returns zero if the physical eraseblock is good, %-EINVAL if * it is bad and a negative error code if an error occurred. */ static int self_check_not_bad(const struct ubi_device *ubi, int pnum) { int err; if (!ubi_dbg_chk_io(ubi)) return 0; err = ubi_io_is_bad(ubi, pnum); if (!err) return err; ubi_err(ubi, "self-check failed for PEB %d", pnum); dump_stack(); return err > 0 ? -EINVAL : err; } /** * self_check_ec_hdr - check if an erase counter header is all right. * @ubi: UBI device description object * @pnum: physical eraseblock number the erase counter header belongs to * @ec_hdr: the erase counter header to check * * This function returns zero if the erase counter header contains valid * values, and %-EINVAL if not. */ static int self_check_ec_hdr(const struct ubi_device *ubi, int pnum, const struct ubi_ec_hdr *ec_hdr) { int err; uint32_t magic; if (!ubi_dbg_chk_io(ubi)) return 0; magic = be32_to_cpu(ec_hdr->magic); if (magic != UBI_EC_HDR_MAGIC) { ubi_err(ubi, "bad magic %#08x, must be %#08x", magic, UBI_EC_HDR_MAGIC); goto fail; } err = validate_ec_hdr(ubi, ec_hdr); if (err) { ubi_err(ubi, "self-check failed for PEB %d", pnum); goto fail; } return 0; fail: ubi_dump_ec_hdr(ec_hdr); dump_stack(); return -EINVAL; } /** * self_check_peb_ec_hdr - check erase counter header. * @ubi: UBI device description object * @pnum: the physical eraseblock number to check * * This function returns zero if the erase counter header is all right and * a negative error code if not or if an error occurred. */ static int self_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum) { int err; uint32_t crc, hdr_crc; struct ubi_ec_hdr *ec_hdr; if (!ubi_dbg_chk_io(ubi)) return 0; ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_NOFS); if (!ec_hdr) return -ENOMEM; err = ubi_io_read(ubi, ec_hdr, pnum, 0, UBI_EC_HDR_SIZE); if (err && err != UBI_IO_BITFLIPS && !mtd_is_eccerr(err)) goto exit; crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC); hdr_crc = be32_to_cpu(ec_hdr->hdr_crc); if (hdr_crc != crc) { ubi_err(ubi, "bad CRC, calculated %#08x, read %#08x", crc, hdr_crc); ubi_err(ubi, "self-check failed for PEB %d", pnum); ubi_dump_ec_hdr(ec_hdr); dump_stack(); err = -EINVAL; goto exit; } err = self_check_ec_hdr(ubi, pnum, ec_hdr); exit: kfree(ec_hdr); return err; } /** * self_check_vid_hdr - check that a volume identifier header is all right. * @ubi: UBI device description object * @pnum: physical eraseblock number the volume identifier header belongs to * @vid_hdr: the volume identifier header to check * * This function returns zero if the volume identifier header is all right, and * %-EINVAL if not. */ static int self_check_vid_hdr(const struct ubi_device *ubi, int pnum, const struct ubi_vid_hdr *vid_hdr) { int err; uint32_t magic; if (!ubi_dbg_chk_io(ubi)) return 0; magic = be32_to_cpu(vid_hdr->magic); if (magic != UBI_VID_HDR_MAGIC) { ubi_err(ubi, "bad VID header magic %#08x at PEB %d, must be %#08x", magic, pnum, UBI_VID_HDR_MAGIC); goto fail; } err = validate_vid_hdr(ubi, vid_hdr); if (err) { ubi_err(ubi, "self-check failed for PEB %d", pnum); goto fail; } return err; fail: ubi_err(ubi, "self-check failed for PEB %d", pnum); ubi_dump_vid_hdr(vid_hdr); dump_stack(); return -EINVAL; } /** * self_check_peb_vid_hdr - check volume identifier header. * @ubi: UBI device description object * @pnum: the physical eraseblock number to check * * This function returns zero if the volume identifier header is all right, * and a negative error code if not or if an error occurred. */ static int self_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum) { int err; uint32_t crc, hdr_crc; struct ubi_vid_io_buf *vidb; struct ubi_vid_hdr *vid_hdr; void *p; if (!ubi_dbg_chk_io(ubi)) return 0; vidb = ubi_alloc_vid_buf(ubi, GFP_NOFS); if (!vidb) return -ENOMEM; vid_hdr = ubi_get_vid_hdr(vidb); p = vidb->buffer; err = ubi_io_read(ubi, p, pnum, ubi->vid_hdr_aloffset, ubi->vid_hdr_alsize); if (err && err != UBI_IO_BITFLIPS && !mtd_is_eccerr(err)) goto exit; crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC); hdr_crc = be32_to_cpu(vid_hdr->hdr_crc); if (hdr_crc != crc) { ubi_err(ubi, "bad VID header CRC at PEB %d, calculated %#08x, read %#08x", pnum, crc, hdr_crc); ubi_err(ubi, "self-check failed for PEB %d", pnum); ubi_dump_vid_hdr(vid_hdr); dump_stack(); err = -EINVAL; goto exit; } err = self_check_vid_hdr(ubi, pnum, vid_hdr); exit: ubi_free_vid_buf(vidb); return err; } /** * self_check_write - make sure write succeeded. * @ubi: UBI device description object * @buf: buffer with data which were written * @pnum: physical eraseblock number the data were written to * @offset: offset within the physical eraseblock the data were written to * @len: how many bytes were written * * This functions reads data which were recently written and compares it with * the original data buffer - the data have to match. Returns zero if the data * match and a negative error code if not or in case of failure. */ static int self_check_write(struct ubi_device *ubi, const void *buf, int pnum, int offset, int len) { int err, i; size_t read; void *buf1; loff_t addr = (loff_t)pnum * ubi->peb_size + offset; if (!ubi_dbg_chk_io(ubi)) return 0; buf1 = __vmalloc(len, GFP_NOFS); if (!buf1) { ubi_err(ubi, "cannot allocate memory to check writes"); return 0; } err = mtd_read(ubi->mtd, addr, len, &read, buf1); if (err && !mtd_is_bitflip(err)) goto out_free; for (i = 0; i < len; i++) { uint8_t c = ((uint8_t *)buf)[i]; uint8_t c1 = ((uint8_t *)buf1)[i]; int dump_len; if (c == c1) continue; ubi_err(ubi, "self-check failed for PEB %d:%d, len %d", pnum, offset, len); ubi_msg(ubi, "data differ at position %d", i); dump_len = max_t(int, 128, len - i); ubi_msg(ubi, "hex dump of the original buffer from %d to %d", i, i + dump_len); print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, buf + i, dump_len, 1); ubi_msg(ubi, "hex dump of the read buffer from %d to %d", i, i + dump_len); print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, buf1 + i, dump_len, 1); dump_stack(); err = -EINVAL; goto out_free; } vfree(buf1); return 0; out_free: vfree(buf1); return err; } /** * ubi_self_check_all_ff - check that a region of flash is empty. * @ubi: UBI device description object * @pnum: the physical eraseblock number to check * @offset: the starting offset within the physical eraseblock to check * @len: the length of the region to check * * This function returns zero if only 0xFF bytes are present at offset * @offset of the physical eraseblock @pnum, and a negative error code if not * or if an error occurred. */ int ubi_self_check_all_ff(struct ubi_device *ubi, int pnum, int offset, int len) { size_t read; int err; void *buf; loff_t addr = (loff_t)pnum * ubi->peb_size + offset; if (!ubi_dbg_chk_io(ubi)) return 0; buf = __vmalloc(len, GFP_NOFS); if (!buf) { ubi_err(ubi, "cannot allocate memory to check for 0xFFs"); return 0; } err = mtd_read(ubi->mtd, addr, len, &read, buf); if (err && !mtd_is_bitflip(err)) { ubi_err(ubi, "err %d while reading %d bytes from PEB %d:%d, read %zd bytes", err, len, pnum, offset, read); goto error; } err = ubi_check_pattern(buf, 0xFF, len); if (err == 0) { ubi_err(ubi, "flash region at PEB %d:%d, length %d does not contain all 0xFF bytes", pnum, offset, len); goto fail; } vfree(buf); return 0; fail: ubi_err(ubi, "self-check failed for PEB %d", pnum); ubi_msg(ubi, "hex dump of the %d-%d region", offset, offset + len); print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, buf, len, 1); err = -EINVAL; error: dump_stack(); vfree(buf); return err; }
4 4 4 4 4 4 4 4 4 4 4 4 4 5 4 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 // SPDX-License-Identifier: GPL-2.0-only /* * CAN driver for PEAK System PCAN-USB adapter * Derived from the PCAN project file driver/src/pcan_usb.c * * Copyright (C) 2003-2010 PEAK System-Technik GmbH * Copyright (C) 2011-2012 Stephane Grosjean <s.grosjean@peak-system.com> * * Many thanks to Klaus Hitschler <klaus.hitschler@gmx.de> */ #include <linux/unaligned.h> #include <linux/ethtool.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/usb.h> #include <linux/can.h> #include <linux/can/dev.h> #include <linux/can/error.h> #include "pcan_usb_core.h" /* PCAN-USB Endpoints */ #define PCAN_USB_EP_CMDOUT 1 #define PCAN_USB_EP_CMDIN (PCAN_USB_EP_CMDOUT | USB_DIR_IN) #define PCAN_USB_EP_MSGOUT 2 #define PCAN_USB_EP_MSGIN (PCAN_USB_EP_MSGOUT | USB_DIR_IN) /* PCAN-USB command struct */ #define PCAN_USB_CMD_FUNC 0 #define PCAN_USB_CMD_NUM 1 #define PCAN_USB_CMD_ARGS 2 #define PCAN_USB_CMD_ARGS_LEN 14 #define PCAN_USB_CMD_LEN (PCAN_USB_CMD_ARGS + \ PCAN_USB_CMD_ARGS_LEN) /* PCAN-USB commands */ #define PCAN_USB_CMD_BITRATE 1 #define PCAN_USB_CMD_SET_BUS 3 #define PCAN_USB_CMD_DEVID 4 #define PCAN_USB_CMD_SN 6 #define PCAN_USB_CMD_REGISTER 9 #define PCAN_USB_CMD_EXT_VCC 10 #define PCAN_USB_CMD_ERR_FR 11 #define PCAN_USB_CMD_LED 12 /* PCAN_USB_CMD_SET_BUS number arg */ #define PCAN_USB_BUS_XCVER 2 #define PCAN_USB_BUS_SILENT_MODE 3 /* PCAN_USB_CMD_xxx functions */ #define PCAN_USB_GET 1 #define PCAN_USB_SET 2 /* PCAN-USB command timeout (ms.) */ #define PCAN_USB_COMMAND_TIMEOUT 1000 /* PCAN-USB startup timeout (ms.) */ #define PCAN_USB_STARTUP_TIMEOUT 10 /* PCAN-USB rx/tx buffers size */ #define PCAN_USB_RX_BUFFER_SIZE 64 #define PCAN_USB_TX_BUFFER_SIZE 64 #define PCAN_USB_MSG_HEADER_LEN 2 #define PCAN_USB_MSG_TX_CAN 2 /* Tx msg is a CAN frame */ /* PCAN-USB adapter internal clock (MHz) */ #define PCAN_USB_CRYSTAL_HZ 16000000 /* PCAN-USB USB message record status/len field */ #define PCAN_USB_STATUSLEN_TIMESTAMP (1 << 7) #define PCAN_USB_STATUSLEN_INTERNAL (1 << 6) #define PCAN_USB_STATUSLEN_EXT_ID (1 << 5) #define PCAN_USB_STATUSLEN_RTR (1 << 4) #define PCAN_USB_STATUSLEN_DLC (0xf) /* PCAN-USB 4.1 CAN Id tx extended flags */ #define PCAN_USB_TX_SRR 0x01 /* SJA1000 SRR command */ #define PCAN_USB_TX_AT 0x02 /* SJA1000 AT command */ /* PCAN-USB error flags */ #define PCAN_USB_ERROR_TXFULL 0x01 #define PCAN_USB_ERROR_RXQOVR 0x02 #define PCAN_USB_ERROR_BUS_LIGHT 0x04 #define PCAN_USB_ERROR_BUS_HEAVY 0x08 #define PCAN_USB_ERROR_BUS_OFF 0x10 #define PCAN_USB_ERROR_RXQEMPTY 0x20 #define PCAN_USB_ERROR_QOVR 0x40 #define PCAN_USB_ERROR_TXQFULL 0x80 #define PCAN_USB_ERROR_BUS (PCAN_USB_ERROR_BUS_LIGHT | \ PCAN_USB_ERROR_BUS_HEAVY | \ PCAN_USB_ERROR_BUS_OFF) /* SJA1000 modes */ #define SJA1000_MODE_NORMAL 0x00 #define SJA1000_MODE_INIT 0x01 /* * tick duration = 42.666 us => * (tick_number * 44739243) >> 20 ~ (tick_number * 42666) / 1000 * accuracy = 10^-7 */ #define PCAN_USB_TS_DIV_SHIFTER 20 #define PCAN_USB_TS_US_PER_TICK 44739243 /* PCAN-USB messages record types */ #define PCAN_USB_REC_ERROR 1 #define PCAN_USB_REC_ANALOG 2 #define PCAN_USB_REC_BUSLOAD 3 #define PCAN_USB_REC_TS 4 #define PCAN_USB_REC_BUSEVT 5 /* CAN bus events notifications selection mask */ #define PCAN_USB_ERR_RXERR 0x02 /* ask for rxerr counter */ #define PCAN_USB_ERR_TXERR 0x04 /* ask for txerr counter */ /* This mask generates an usb packet each time the state of the bus changes. * In other words, its interest is to know which side among rx and tx is * responsible of the change of the bus state. */ #define PCAN_USB_BERR_MASK (PCAN_USB_ERR_RXERR | PCAN_USB_ERR_TXERR) /* identify bus event packets with rx/tx error counters */ #define PCAN_USB_ERR_CNT_DEC 0x00 /* counters are decreasing */ #define PCAN_USB_ERR_CNT_INC 0x80 /* counters are increasing */ /* private to PCAN-USB adapter */ struct pcan_usb { struct peak_usb_device dev; struct peak_time_ref time_ref; struct timer_list restart_timer; struct can_berr_counter bec; }; /* incoming message context for decoding */ struct pcan_usb_msg_context { u16 ts16; u8 prev_ts8; u8 *ptr; u8 *end; u8 rec_cnt; u8 rec_idx; u8 rec_ts_idx; struct net_device *netdev; struct pcan_usb *pdev; }; /* * send a command */ static int pcan_usb_send_cmd(struct peak_usb_device *dev, u8 f, u8 n, u8 *p) { int err; int actual_length; /* usb device unregistered? */ if (!(dev->state & PCAN_USB_STATE_CONNECTED)) return 0; dev->cmd_buf[PCAN_USB_CMD_FUNC] = f; dev->cmd_buf[PCAN_USB_CMD_NUM] = n; if (p) memcpy(dev->cmd_buf + PCAN_USB_CMD_ARGS, p, PCAN_USB_CMD_ARGS_LEN); err = usb_bulk_msg(dev->udev, usb_sndbulkpipe(dev->udev, PCAN_USB_EP_CMDOUT), dev->cmd_buf, PCAN_USB_CMD_LEN, &actual_length, PCAN_USB_COMMAND_TIMEOUT); if (err) netdev_err(dev->netdev, "sending cmd f=0x%x n=0x%x failure: %d\n", f, n, err); return err; } /* * send a command then wait for its response */ static int pcan_usb_wait_rsp(struct peak_usb_device *dev, u8 f, u8 n, u8 *p) { int err; int actual_length; /* usb device unregistered? */ if (!(dev->state & PCAN_USB_STATE_CONNECTED)) return 0; /* first, send command */ err = pcan_usb_send_cmd(dev, f, n, NULL); if (err) return err; err = usb_bulk_msg(dev->udev, usb_rcvbulkpipe(dev->udev, PCAN_USB_EP_CMDIN), dev->cmd_buf, PCAN_USB_CMD_LEN, &actual_length, PCAN_USB_COMMAND_TIMEOUT); if (err) netdev_err(dev->netdev, "waiting rsp f=0x%x n=0x%x failure: %d\n", f, n, err); else if (p) memcpy(p, dev->cmd_buf + PCAN_USB_CMD_ARGS, PCAN_USB_CMD_ARGS_LEN); return err; } static int pcan_usb_set_sja1000(struct peak_usb_device *dev, u8 mode) { u8 args[PCAN_USB_CMD_ARGS_LEN] = { [1] = mode, }; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_REGISTER, PCAN_USB_SET, args); } static int pcan_usb_set_bus(struct peak_usb_device *dev, u8 onoff) { u8 args[PCAN_USB_CMD_ARGS_LEN] = { [0] = !!onoff, }; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_SET_BUS, PCAN_USB_BUS_XCVER, args); } static int pcan_usb_set_silent(struct peak_usb_device *dev, u8 onoff) { u8 args[PCAN_USB_CMD_ARGS_LEN] = { [0] = !!onoff, }; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_SET_BUS, PCAN_USB_BUS_SILENT_MODE, args); } /* send the cmd to be notified from bus errors */ static int pcan_usb_set_err_frame(struct peak_usb_device *dev, u8 err_mask) { u8 args[PCAN_USB_CMD_ARGS_LEN] = { [0] = err_mask, }; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_ERR_FR, PCAN_USB_SET, args); } static int pcan_usb_set_ext_vcc(struct peak_usb_device *dev, u8 onoff) { u8 args[PCAN_USB_CMD_ARGS_LEN] = { [0] = !!onoff, }; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_EXT_VCC, PCAN_USB_SET, args); } static int pcan_usb_set_led(struct peak_usb_device *dev, u8 onoff) { u8 args[PCAN_USB_CMD_ARGS_LEN] = { [0] = !!onoff, }; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_LED, PCAN_USB_SET, args); } /* * set bittiming value to can */ static int pcan_usb_set_bittiming(struct peak_usb_device *dev, struct can_bittiming *bt) { u8 args[PCAN_USB_CMD_ARGS_LEN]; u8 btr0, btr1; btr0 = ((bt->brp - 1) & 0x3f) | (((bt->sjw - 1) & 0x3) << 6); btr1 = ((bt->prop_seg + bt->phase_seg1 - 1) & 0xf) | (((bt->phase_seg2 - 1) & 0x7) << 4); if (dev->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES) btr1 |= 0x80; netdev_info(dev->netdev, "setting BTR0=0x%02x BTR1=0x%02x\n", btr0, btr1); args[0] = btr1; args[1] = btr0; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_BITRATE, PCAN_USB_SET, args); } /* * init/reset can */ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff) { int err; err = pcan_usb_set_bus(dev, onoff); if (err) return err; if (!onoff) { err = pcan_usb_set_sja1000(dev, SJA1000_MODE_INIT); } else { /* the PCAN-USB needs time to init */ set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); } return err; } /* * handle end of waiting for the device to reset */ static void pcan_usb_restart(struct timer_list *t) { struct pcan_usb *pdev = timer_container_of(pdev, t, restart_timer); struct peak_usb_device *dev = &pdev->dev; /* notify candev and netdev */ peak_usb_restart_complete(dev); } /* * handle the submission of the restart urb */ static void pcan_usb_restart_pending(struct urb *urb) { struct pcan_usb *pdev = urb->context; /* the PCAN-USB needs time to restart */ mod_timer(&pdev->restart_timer, jiffies + msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); /* can delete usb resources */ peak_usb_async_complete(urb); } /* * handle asynchronous restart */ static int pcan_usb_restart_async(struct peak_usb_device *dev, struct urb *urb, u8 *buf) { struct pcan_usb *pdev = container_of(dev, struct pcan_usb, dev); if (timer_pending(&pdev->restart_timer)) return -EBUSY; /* set bus on */ buf[PCAN_USB_CMD_FUNC] = 3; buf[PCAN_USB_CMD_NUM] = 2; buf[PCAN_USB_CMD_ARGS] = 1; usb_fill_bulk_urb(urb, dev->udev, usb_sndbulkpipe(dev->udev, PCAN_USB_EP_CMDOUT), buf, PCAN_USB_CMD_LEN, pcan_usb_restart_pending, pdev); return usb_submit_urb(urb, GFP_ATOMIC); } /* * read serial number from device */ static int pcan_usb_get_serial(struct peak_usb_device *dev, u32 *serial_number) { u8 args[PCAN_USB_CMD_ARGS_LEN]; int err; err = pcan_usb_wait_rsp(dev, PCAN_USB_CMD_SN, PCAN_USB_GET, args); if (err) return err; *serial_number = le32_to_cpup((__le32 *)args); return 0; } /* * read can channel id from device */ static int pcan_usb_get_can_channel_id(struct peak_usb_device *dev, u32 *can_ch_id) { u8 args[PCAN_USB_CMD_ARGS_LEN]; int err; err = pcan_usb_wait_rsp(dev, PCAN_USB_CMD_DEVID, PCAN_USB_GET, args); if (err) netdev_err(dev->netdev, "getting can channel id failure: %d\n", err); else *can_ch_id = args[0]; return err; } /* set a new CAN channel id in the flash memory of the device */ static int pcan_usb_set_can_channel_id(struct peak_usb_device *dev, u32 can_ch_id) { u8 args[PCAN_USB_CMD_ARGS_LEN]; /* this kind of device supports 8-bit values only */ if (can_ch_id > U8_MAX) return -EINVAL; /* during the flash process the device disconnects during ~1.25 s.: * prohibit access when interface is UP */ if (dev->netdev->flags & IFF_UP) return -EBUSY; args[0] = can_ch_id; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_DEVID, PCAN_USB_SET, args); } /* * update current time ref with received timestamp */ static int pcan_usb_update_ts(struct pcan_usb_msg_context *mc) { if ((mc->ptr + 2) > mc->end) return -EINVAL; mc->ts16 = get_unaligned_le16(mc->ptr); if (mc->rec_idx > 0) peak_usb_update_ts_now(&mc->pdev->time_ref, mc->ts16); else peak_usb_set_ts_now(&mc->pdev->time_ref, mc->ts16); return 0; } /* * decode received timestamp */ static int pcan_usb_decode_ts(struct pcan_usb_msg_context *mc, u8 first_packet) { /* only 1st packet supplies a word timestamp */ if (first_packet) { if ((mc->ptr + 2) > mc->end) return -EINVAL; mc->ts16 = get_unaligned_le16(mc->ptr); mc->prev_ts8 = mc->ts16 & 0x00ff; mc->ptr += 2; } else { u8 ts8; if ((mc->ptr + 1) > mc->end) return -EINVAL; ts8 = *mc->ptr++; if (ts8 < mc->prev_ts8) mc->ts16 += 0x100; mc->ts16 &= 0xff00; mc->ts16 |= ts8; mc->prev_ts8 = ts8; } return 0; } static int pcan_usb_decode_error(struct pcan_usb_msg_context *mc, u8 n, u8 status_len) { struct sk_buff *skb; struct can_frame *cf; enum can_state new_state = CAN_STATE_ERROR_ACTIVE; /* ignore this error until 1st ts received */ if (n == PCAN_USB_ERROR_QOVR) if (!mc->pdev->time_ref.tick_count) return 0; /* allocate an skb to store the error frame */ skb = alloc_can_err_skb(mc->netdev, &cf); if (n & PCAN_USB_ERROR_RXQOVR) { /* data overrun interrupt */ netdev_dbg(mc->netdev, "data overrun interrupt\n"); mc->netdev->stats.rx_over_errors++; mc->netdev->stats.rx_errors++; if (cf) { cf->can_id |= CAN_ERR_CRTL; cf->data[1] |= CAN_ERR_CRTL_RX_OVERFLOW; } } if (n & PCAN_USB_ERROR_TXQFULL) netdev_dbg(mc->netdev, "device Tx queue full)\n"); if (n & PCAN_USB_ERROR_BUS_OFF) { new_state = CAN_STATE_BUS_OFF; } else if (n & PCAN_USB_ERROR_BUS_HEAVY) { new_state = ((mc->pdev->bec.txerr >= 128) || (mc->pdev->bec.rxerr >= 128)) ? CAN_STATE_ERROR_PASSIVE : CAN_STATE_ERROR_WARNING; } else { new_state = CAN_STATE_ERROR_ACTIVE; } /* handle change of state */ if (new_state != mc->pdev->dev.can.state) { enum can_state tx_state = (mc->pdev->bec.txerr >= mc->pdev->bec.rxerr) ? new_state : 0; enum can_state rx_state = (mc->pdev->bec.txerr <= mc->pdev->bec.rxerr) ? new_state : 0; can_change_state(mc->netdev, cf, tx_state, rx_state); if (new_state == CAN_STATE_BUS_OFF) { can_bus_off(mc->netdev); } else if (cf && (cf->can_id & CAN_ERR_CRTL)) { /* Supply TX/RX error counters in case of * controller error. */ cf->can_id = CAN_ERR_CNT; cf->data[6] = mc->pdev->bec.txerr; cf->data[7] = mc->pdev->bec.rxerr; } } if (!skb) return -ENOMEM; if (status_len & PCAN_USB_STATUSLEN_TIMESTAMP) { struct skb_shared_hwtstamps *hwts = skb_hwtstamps(skb); peak_usb_get_ts_time(&mc->pdev->time_ref, mc->ts16, &hwts->hwtstamp); } netif_rx(skb); return 0; } /* decode bus event usb packet: first byte contains rxerr while 2nd one contains * txerr. */ static int pcan_usb_handle_bus_evt(struct pcan_usb_msg_context *mc, u8 ir) { struct pcan_usb *pdev = mc->pdev; /* according to the content of the packet */ switch (ir) { case PCAN_USB_ERR_CNT_DEC: case PCAN_USB_ERR_CNT_INC: /* save rx/tx error counters from in the device context */ pdev->bec.rxerr = mc->ptr[1]; pdev->bec.txerr = mc->ptr[2]; break; default: /* reserved */ break; } return 0; } /* * decode non-data usb message */ static int pcan_usb_decode_status(struct pcan_usb_msg_context *mc, u8 status_len) { u8 rec_len = status_len & PCAN_USB_STATUSLEN_DLC; u8 f, n; int err; /* check whether function and number can be read */ if ((mc->ptr + 2) > mc->end) return -EINVAL; f = mc->ptr[PCAN_USB_CMD_FUNC]; n = mc->ptr[PCAN_USB_CMD_NUM]; mc->ptr += PCAN_USB_CMD_ARGS; if (status_len & PCAN_USB_STATUSLEN_TIMESTAMP) { int err = pcan_usb_decode_ts(mc, !mc->rec_ts_idx); if (err) return err; /* Next packet in the buffer will have a timestamp on a single * byte */ mc->rec_ts_idx++; } switch (f) { case PCAN_USB_REC_ERROR: err = pcan_usb_decode_error(mc, n, status_len); if (err) return err; break; case PCAN_USB_REC_ANALOG: /* analog values (ignored) */ rec_len = 2; break; case PCAN_USB_REC_BUSLOAD: /* bus load (ignored) */ rec_len = 1; break; case PCAN_USB_REC_TS: /* only timestamp */ if (pcan_usb_update_ts(mc)) return -EINVAL; break; case PCAN_USB_REC_BUSEVT: /* bus event notifications (get rxerr/txerr) */ err = pcan_usb_handle_bus_evt(mc, n); if (err) return err; break; default: netdev_err(mc->netdev, "unexpected function %u\n", f); break; } if ((mc->ptr + rec_len) > mc->end) return -EINVAL; mc->ptr += rec_len; return 0; } /* * decode data usb message */ static int pcan_usb_decode_data(struct pcan_usb_msg_context *mc, u8 status_len) { u8 rec_len = status_len & PCAN_USB_STATUSLEN_DLC; struct sk_buff *skb; struct can_frame *cf; struct skb_shared_hwtstamps *hwts; u32 can_id_flags; skb = alloc_can_skb(mc->netdev, &cf); if (!skb) return -ENOMEM; if (status_len & PCAN_USB_STATUSLEN_EXT_ID) { if ((mc->ptr + 4) > mc->end) goto decode_failed; can_id_flags = get_unaligned_le32(mc->ptr); cf->can_id = can_id_flags >> 3 | CAN_EFF_FLAG; mc->ptr += 4; } else { if ((mc->ptr + 2) > mc->end) goto decode_failed; can_id_flags = get_unaligned_le16(mc->ptr); cf->can_id = can_id_flags >> 5; mc->ptr += 2; } can_frame_set_cc_len(cf, rec_len, mc->pdev->dev.can.ctrlmode); /* Only first packet timestamp is a word */ if (pcan_usb_decode_ts(mc, !mc->rec_ts_idx)) goto decode_failed; /* Next packet in the buffer will have a timestamp on a single byte */ mc->rec_ts_idx++; /* read data */ memset(cf->data, 0x0, sizeof(cf->data)); if (status_len & PCAN_USB_STATUSLEN_RTR) { cf->can_id |= CAN_RTR_FLAG; } else { if ((mc->ptr + rec_len) > mc->end) goto decode_failed; memcpy(cf->data, mc->ptr, cf->len); mc->ptr += rec_len; /* Ignore next byte (client private id) if SRR bit is set */ if (can_id_flags & PCAN_USB_TX_SRR) mc->ptr++; /* update statistics */ mc->netdev->stats.rx_bytes += cf->len; } mc->netdev->stats.rx_packets++; /* convert timestamp into kernel time */ hwts = skb_hwtstamps(skb); peak_usb_get_ts_time(&mc->pdev->time_ref, mc->ts16, &hwts->hwtstamp); /* push the skb */ netif_rx(skb); return 0; decode_failed: dev_kfree_skb(skb); return -EINVAL; } /* * process incoming message */ static int pcan_usb_decode_msg(struct peak_usb_device *dev, u8 *ibuf, u32 lbuf) { struct pcan_usb_msg_context mc = { .rec_cnt = ibuf[1], .ptr = ibuf + PCAN_USB_MSG_HEADER_LEN, .end = ibuf + lbuf, .netdev = dev->netdev, .pdev = container_of(dev, struct pcan_usb, dev), }; int err; for (err = 0; mc.rec_idx < mc.rec_cnt && !err; mc.rec_idx++) { u8 sl = *mc.ptr++; /* handle status and error frames here */ if (sl & PCAN_USB_STATUSLEN_INTERNAL) { err = pcan_usb_decode_status(&mc, sl); /* handle normal can frames here */ } else { err = pcan_usb_decode_data(&mc, sl); } } return err; } /* * process any incoming buffer */ static int pcan_usb_decode_buf(struct peak_usb_device *dev, struct urb *urb) { int err = 0; if (urb->actual_length > PCAN_USB_MSG_HEADER_LEN) { err = pcan_usb_decode_msg(dev, urb->transfer_buffer, urb->actual_length); } else if (urb->actual_length > 0) { netdev_err(dev->netdev, "usb message length error (%u)\n", urb->actual_length); err = -EINVAL; } return err; } /* * process outgoing packet */ static int pcan_usb_encode_msg(struct peak_usb_device *dev, struct sk_buff *skb, u8 *obuf, size_t *size) { struct net_device *netdev = dev->netdev; struct net_device_stats *stats = &netdev->stats; struct can_frame *cf = (struct can_frame *)skb->data; u32 can_id_flags = cf->can_id & CAN_ERR_MASK; u8 *pc; obuf[0] = PCAN_USB_MSG_TX_CAN; obuf[1] = 1; /* only one CAN frame is stored in the packet */ pc = obuf + PCAN_USB_MSG_HEADER_LEN; /* status/len byte */ *pc = can_get_cc_dlc(cf, dev->can.ctrlmode); if (cf->can_id & CAN_RTR_FLAG) *pc |= PCAN_USB_STATUSLEN_RTR; /* can id */ if (cf->can_id & CAN_EFF_FLAG) { *pc |= PCAN_USB_STATUSLEN_EXT_ID; pc++; can_id_flags <<= 3; if (dev->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) can_id_flags |= PCAN_USB_TX_SRR; if (dev->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT) can_id_flags |= PCAN_USB_TX_AT; put_unaligned_le32(can_id_flags, pc); pc += 4; } else { pc++; can_id_flags <<= 5; if (dev->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) can_id_flags |= PCAN_USB_TX_SRR; if (dev->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT) can_id_flags |= PCAN_USB_TX_AT; put_unaligned_le16(can_id_flags, pc); pc += 2; } /* can data */ if (!(cf->can_id & CAN_RTR_FLAG)) { memcpy(pc, cf->data, cf->len); pc += cf->len; } /* SRR bit needs a writer id (useless here) */ if (can_id_flags & PCAN_USB_TX_SRR) *pc++ = 0x80; obuf[(*size)-1] = (u8)(stats->tx_packets & 0xff); return 0; } /* socket callback used to copy berr counters values received through USB */ static int pcan_usb_get_berr_counter(const struct net_device *netdev, struct can_berr_counter *bec) { struct peak_usb_device *dev = netdev_priv(netdev); struct pcan_usb *pdev = container_of(dev, struct pcan_usb, dev); *bec = pdev->bec; /* must return 0 */ return 0; } /* * start interface */ static int pcan_usb_start(struct peak_usb_device *dev) { struct pcan_usb *pdev = container_of(dev, struct pcan_usb, dev); int err; /* number of bits used in timestamps read from adapter struct */ peak_usb_init_time_ref(&pdev->time_ref, &pcan_usb); pdev->bec.rxerr = 0; pdev->bec.txerr = 0; /* always ask the device for BERR reporting, to be able to switch from * WARNING to PASSIVE state */ err = pcan_usb_set_err_frame(dev, PCAN_USB_BERR_MASK); if (err) netdev_warn(dev->netdev, "Asking for BERR reporting error %u\n", err); /* if revision greater than 3, can put silent mode on/off */ if (dev->device_rev > 3) { err = pcan_usb_set_silent(dev, dev->can.ctrlmode & CAN_CTRLMODE_LISTENONLY); if (err) return err; } return pcan_usb_set_ext_vcc(dev, 0); } static int pcan_usb_init(struct peak_usb_device *dev) { struct pcan_usb *pdev = container_of(dev, struct pcan_usb, dev); u32 serial_number; int err; /* initialize a timer needed to wait for hardware restart */ timer_setup(&pdev->restart_timer, pcan_usb_restart, 0); /* * explicit use of dev_xxx() instead of netdev_xxx() here: * information displayed are related to the device itself, not * to the canx netdevice. */ err = pcan_usb_get_serial(dev, &serial_number); if (err) { dev_err(dev->netdev->dev.parent, "unable to read %s serial number (err %d)\n", pcan_usb.name, err); return err; } dev_info(dev->netdev->dev.parent, "PEAK-System %s adapter hwrev %u serial %08X (%u channel)\n", pcan_usb.name, dev->device_rev, serial_number, pcan_usb.ctrl_count); /* Since rev 4.1, PCAN-USB is able to make single-shot as well as * looped back frames. */ if (dev->device_rev >= 41) { struct can_priv *priv = netdev_priv(dev->netdev); priv->ctrlmode_supported |= CAN_CTRLMODE_ONE_SHOT | CAN_CTRLMODE_LOOPBACK; } else { dev_info(dev->netdev->dev.parent, "Firmware update available. Please contact support@peak-system.com\n"); } return 0; } /* * probe function for new PCAN-USB usb interface */ static int pcan_usb_probe(struct usb_interface *intf) { struct usb_host_interface *if_desc; int i; if_desc = intf->altsetting; /* check interface endpoint addresses */ for (i = 0; i < if_desc->desc.bNumEndpoints; i++) {