Total coverage: 318498 (17%)of 1898608
9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 // SPDX-License-Identifier: GPL-2.0+ /* * Driver for USB Mass Storage compliant devices * SCSI layer glue code * * Current development and maintenance by: * (c) 1999-2002 Matthew Dharm (mdharm-usb@one-eyed-alien.net) * * Developed with the assistance of: * (c) 2000 David L. Brown, Jr. (usb-storage@davidb.org) * (c) 2000 Stephen J. Gowdy (SGowdy@lbl.gov) * * Initial work by: * (c) 1999 Michael Gee (michael@linuxspecific.com) * * This driver is based on the 'USB Mass Storage Class' document. This * describes in detail the protocol used to communicate with such * devices. Clearly, the designers had SCSI and ATAPI commands in * mind when they created this document. The commands are all very * similar to commands in the SCSI-II and ATAPI specifications. * * It is important to note that in a number of cases this class * exhibits class-specific exemptions from the USB specification. * Notably the usage of NAK, STALL and ACK differs from the norm, in * that they are used to communicate wait, failed and OK on commands. * * Also, for certain devices, the interrupt endpoint is used to convey * status of a command. */ #include <linux/blkdev.h> #include <linux/dma-mapping.h> #include <linux/module.h> #include <linux/mutex.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_devinfo.h> #include <scsi/scsi_device.h> #include <scsi/scsi_eh.h> #include "usb.h" #include "scsiglue.h" #include "debug.h" #include "transport.h" #include "protocol.h" /* * Vendor IDs for companies that seem to include the READ CAPACITY bug * in all their devices */ #define VENDOR_ID_NOKIA 0x0421 #define VENDOR_ID_NIKON 0x04b0 #define VENDOR_ID_PENTAX 0x0a17 #define VENDOR_ID_MOTOROLA 0x22b8 /*********************************************************************** * Host functions ***********************************************************************/ static const char* host_info(struct Scsi_Host *host) { struct us_data *us = host_to_us(host); return us->scsi_name; } static int sdev_init (struct scsi_device *sdev) { struct us_data *us = host_to_us(sdev->host); /* * Set the INQUIRY transfer length to 36. We don't use any of * the extra data and many devices choke if asked for more or * less than 36 bytes. */ sdev->inquiry_len = 36; /* Tell the SCSI layer if we know there is more than one LUN */ if (us->protocol == USB_PR_BULK && us->max_lun > 0) sdev->sdev_bflags |= BLIST_FORCELUN; /* * Some USB storage devices reset if the IO advice hints grouping mode * page is queried. Hence skip that mode page. */ sdev->sdev_bflags |= BLIST_SKIP_IO_HINTS; return 0; } static int sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) { struct us_data *us = host_to_us(sdev->host); struct device *dev = us->pusb_dev->bus->sysdev; /* * Many devices have trouble transferring more than 32KB at a time, * while others have trouble with more than 64K. At this time we * are limiting both to 32K (64 sectores). */ if (us->fflags & (US_FL_MAX_SECTORS_64 | US_FL_MAX_SECTORS_MIN)) { unsigned int max_sectors = 64; if (us->fflags & US_FL_MAX_SECTORS_MIN) max_sectors = PAGE_SIZE >> 9; lim->max_hw_sectors = min(lim->max_hw_sectors, max_sectors); } else if (sdev->type == TYPE_TAPE) { /* * Tapes need much higher max_sector limits, so just * raise it to the maximum possible (4 GB / 512) and * let the queue segment size sort out the real limit. */ lim->max_hw_sectors = 0x7FFFFF; } else if (us->pusb_dev->speed >= USB_SPEED_SUPER) { /* * USB3 devices will be limited to 2048 sectors. This gives us * better throughput on most devices. */ lim->max_hw_sectors = 2048; } /* * The max_hw_sectors should be up to maximum size of a mapping for * the device. Otherwise, a DMA API might fail on swiotlb environment. */ lim->max_hw_sectors = min_t(size_t, lim->max_hw_sectors, dma_max_mapping_size(dev) >> SECTOR_SHIFT); /* * We can't put these settings in sdev_init() because that gets * called before the device type is known. Consequently these * settings can't be overridden via the scsi devinfo mechanism. */ if (sdev->type == TYPE_DISK) { /* * Some vendors seem to put the READ CAPACITY bug into * all their devices -- primarily makers of cell phones * and digital cameras. Since these devices always use * flash media and can be expected to have an even number * of sectors, we will always enable the CAPACITY_HEURISTICS * flag unless told otherwise. */ switch (le16_to_cpu(us->pusb_dev->descriptor.idVendor)) { case VENDOR_ID_NOKIA: case VENDOR_ID_NIKON: case VENDOR_ID_PENTAX: case VENDOR_ID_MOTOROLA: if (!(us->fflags & (US_FL_FIX_CAPACITY | US_FL_CAPACITY_OK))) us->fflags |= US_FL_CAPACITY_HEURISTICS; break; } /* * Disk-type devices use MODE SENSE(6) if the protocol * (SubClass) is Transparent SCSI, otherwise they use * MODE SENSE(10). */ if (us->subclass != USB_SC_SCSI && us->subclass != USB_SC_CYP_ATACB) sdev->use_10_for_ms = 1; /* *Many disks only accept MODE SENSE transfer lengths of * 192 bytes (that's what Windows uses). */ sdev->use_192_bytes_for_3f = 1; /* * Some devices report generic values until the media has been * accessed. Force a READ(10) prior to querying device * characteristics. */ sdev->read_before_ms = 1; /* * Some devices don't like MODE SENSE with page=0x3f, * which is the command used for checking if a device * is write-protected. Now that we tell the sd driver * to do a 192-byte transfer with this command the * majority of devices work fine, but a few still can't * handle it. The sd driver will simply assume those * devices are write-enabled. */ if (us->fflags & US_FL_NO_WP_DETECT) sdev->skip_ms_page_3f = 1; /* * A number of devices have problems with MODE SENSE for * page x08, so we will skip it. */ sdev->skip_ms_page_8 = 1; /* * Some devices don't handle VPD pages correctly, so skip vpd * pages if not forced by SCSI layer. */ sdev->skip_vpd_pages = !sdev->try_vpd_pages; /* Do not attempt to use REPORT SUPPORTED OPERATION CODES */ sdev->no_report_opcodes = 1; /* Do not attempt to use WRITE SAME */ sdev->no_write_same = 1; /* * Some disks return the total number of blocks in response * to READ CAPACITY rather than the highest block number. * If this device makes that mistake, tell the sd driver. */ if (us->fflags & US_FL_FIX_CAPACITY) sdev->fix_capacity = 1; /* * A few disks have two indistinguishable version, one of * which reports the correct capacity and the other does not. * The sd driver has to guess which is the case. */ if (us->fflags & US_FL_CAPACITY_HEURISTICS) sdev->guess_capacity = 1; /* Some devices cannot handle READ_CAPACITY_16 */ if (us->fflags & US_FL_NO_READ_CAPACITY_16) sdev->no_read_capacity_16 = 1; /* * Many devices do not respond properly to READ_CAPACITY_16. * Tell the SCSI layer to try READ_CAPACITY_10 first. * However some USB 3.0 drive enclosures return capacity * modulo 2TB. Those must use READ_CAPACITY_16 */ if (!(us->fflags & US_FL_NEEDS_CAP16)) sdev->try_rc_10_first = 1; /* * assume SPC3 or latter devices support sense size > 18 * unless US_FL_BAD_SENSE quirk is specified. */ if (sdev->scsi_level > SCSI_SPC_2 && !(us->fflags & US_FL_BAD_SENSE)) us->fflags |= US_FL_SANE_SENSE; /* * USB-IDE bridges tend to report SK = 0x04 (Non-recoverable * Hardware Error) when any low-level error occurs, * recoverable or not. Setting this flag tells the SCSI * midlayer to retry such commands, which frequently will * succeed and fix the error. The worst this can lead to * is an occasional series of retries that will all fail. */ sdev->retry_hwerror = 1; /* * USB disks should allow restart. Some drives spin down * automatically, requiring a START-STOP UNIT command. */ sdev->allow_restart = 1; /* * Some USB cardreaders have trouble reading an sdcard's last * sector in a larger then 1 sector read, since the performance * impact is negligible we set this flag for all USB disks */ sdev->last_sector_bug = 1; /* * Enable last-sector hacks for single-target devices using * the Bulk-only transport, unless we already know the * capacity will be decremented or is correct. */ if (!(us->fflags & (US_FL_FIX_CAPACITY | US_FL_CAPACITY_OK | US_FL_SCM_MULT_TARG)) && us->protocol == USB_PR_BULK) us->use_last_sector_hacks = 1; /* Check if write cache default on flag is set or not */ if (us->fflags & US_FL_WRITE_CACHE) sdev->wce_default_on = 1; /* A few buggy USB-ATA bridges don't understand FUA */ if (us->fflags & US_FL_BROKEN_FUA) sdev->broken_fua = 1; /* Some even totally fail to indicate a cache */ if (us->fflags & US_FL_ALWAYS_SYNC) { /* don't read caching information */ sdev->skip_ms_page_8 = 1; sdev->skip_ms_page_3f = 1; /* assume sync is needed */ sdev->wce_default_on = 1; } } else { /* * Non-disk-type devices don't need to ignore any pages * or to force 192-byte transfer lengths for MODE SENSE. * But they do need to use MODE SENSE(10). */ sdev->use_10_for_ms = 1; /* Some (fake) usb cdrom devices don't like READ_DISC_INFO */ if (us->fflags & US_FL_NO_READ_DISC_INFO) sdev->no_read_disc_info = 1; } /* * The CB and CBI transports have no way to pass LUN values * other than the bits in the second byte of a CDB. But those * bits don't get set to the LUN value if the device reports * scsi_level == 0 (UNKNOWN). Hence such devices must necessarily * be single-LUN. */ if ((us->protocol == USB_PR_CB || us->protocol == USB_PR_CBI) && sdev->scsi_level == SCSI_UNKNOWN) us->max_lun = 0; /* * Some devices choke when they receive a PREVENT-ALLOW MEDIUM * REMOVAL command, so suppress those commands. */ if (us->fflags & US_FL_NOT_LOCKABLE) sdev->lockable = 0; /* * this is to satisfy the compiler, tho I don't think the * return code is ever checked anywhere. */ return 0; } static int target_alloc(struct scsi_target *starget) { struct us_data *us = host_to_us(dev_to_shost(starget->dev.parent)); /* * Some USB drives don't support REPORT LUNS, even though they * report a SCSI revision level above 2. Tell the SCSI layer * not to issue that command; it will perform a normal sequential * scan instead. */ starget->no_report_luns = 1; /* * The UFI spec treats the Peripheral Qualifier bits in an * INQUIRY result as reserved and requires devices to set them * to 0. However the SCSI spec requires these bits to be set * to 3 to indicate when a LUN is not present. * * Let the scanning code know if this target merely sets * Peripheral Device Type to 0x1f to indicate no LUN. */ if (us->subclass == USB_SC_UFI) starget->pdt_1f_for_no_lun = 1; return 0; } /* queue a command */ /* This is always called with scsi_lock(host) held */ static int queuecommand_lck(struct scsi_cmnd *srb) { void (*done)(struct scsi_cmnd *) = scsi_done; struct us_data *us = host_to_us(srb->device->host); /* check for state-transition errors */ if (us->srb != NULL) { dev_err(&us->pusb_intf->dev, "Error in %s: us->srb = %p\n", __func__, us->srb); return SCSI_MLQUEUE_HOST_BUSY; } /* fail the command if we are disconnecting */ if (test_bit(US_FLIDX_DISCONNECTING, &us->dflags)) { usb_stor_dbg(us, "Fail command during disconnect\n"); srb->result = DID_NO_CONNECT << 16; done(srb); return 0; } if ((us->fflags & US_FL_NO_ATA_1X) && (srb->cmnd[0] == ATA_12 || srb->cmnd[0] == ATA_16)) { memcpy(srb->sense_buffer, usb_stor_sense_invalidCDB, sizeof(usb_stor_sense_invalidCDB)); srb->result = SAM_STAT_CHECK_CONDITION; done(srb); return 0; } /* enqueue the command and wake up the control thread */ us->srb = srb; complete(&us->cmnd_ready); return 0; } static DEF_SCSI_QCMD(queuecommand) /*********************************************************************** * Error handling functions ***********************************************************************/ /* Command timeout and abort */ static int command_abort_matching(struct us_data *us, struct scsi_cmnd *srb_match) { /* * us->srb together with the TIMED_OUT, RESETTING, and ABORTING * bits are protected by the host lock. */ scsi_lock(us_to_host(us)); /* is there any active pending command to abort ? */ if (!us->srb) { scsi_unlock(us_to_host(us)); usb_stor_dbg(us, "-- nothing to abort\n"); return SUCCESS; } /* Does the command match the passed srb if any ? */ if (srb_match && us->srb != srb_match) { scsi_unlock(us_to_host(us)); usb_stor_dbg(us, "-- pending command mismatch\n"); return FAILED; } /* * Set the TIMED_OUT bit. Also set the ABORTING bit, but only if * a device reset isn't already in progress (to avoid interfering * with the reset). Note that we must retain the host lock while * calling usb_stor_stop_transport(); otherwise it might interfere * with an auto-reset that begins as soon as we release the lock. */ set_bit(US_FLIDX_TIMED_OUT, &us->dflags); if (!test_bit(US_FLIDX_RESETTING, &us->dflags)) { set_bit(US_FLIDX_ABORTING, &us->dflags); usb_stor_stop_transport(us); } scsi_unlock(us_to_host(us)); /* Wait for the aborted command to finish */ wait_for_completion(&us->notify); return SUCCESS; } static int command_abort(struct scsi_cmnd *srb) { struct us_data *us = host_to_us(srb->device->host); usb_stor_dbg(us, "%s called\n", __func__); return command_abort_matching(us, srb); } /* * This invokes the transport reset mechanism to reset the state of the * device */ static int device_reset(struct scsi_cmnd *srb) { struct us_data *us = host_to_us(srb->device->host); int result; usb_stor_dbg(us, "%s called\n", __func__); /* abort any pending command before reset */ command_abort_matching(us, NULL); /* lock the device pointers and do the reset */ mutex_lock(&(us->dev_mutex)); result = us->transport_reset(us); mutex_unlock(&us->dev_mutex); return result < 0 ? FAILED : SUCCESS; } /* Simulate a SCSI bus reset by resetting the device's USB port. */ static int bus_reset(struct scsi_cmnd *srb) { struct us_data *us = host_to_us(srb->device->host); int result; usb_stor_dbg(us, "%s called\n", __func__); result = usb_stor_port_reset(us); return result < 0 ? FAILED : SUCCESS; } /* * Report a driver-initiated device reset to the SCSI layer. * Calling this for a SCSI-initiated reset is unnecessary but harmless. * The caller must own the SCSI host lock. */ void usb_stor_report_device_reset(struct us_data *us) { int i; struct Scsi_Host *host = us_to_host(us); scsi_report_device_reset(host, 0, 0); if (us->fflags & US_FL_SCM_MULT_TARG) { for (i = 1; i < host->max_id; ++i) scsi_report_device_reset(host, 0, i); } } /* * Report a driver-initiated bus reset to the SCSI layer. * Calling this for a SCSI-initiated reset is unnecessary but harmless. * The caller must not own the SCSI host lock. */ void usb_stor_report_bus_reset(struct us_data *us) { struct Scsi_Host *host = us_to_host(us); scsi_lock(host); scsi_report_bus_reset(host, 0); scsi_unlock(host); } /*********************************************************************** * /proc/scsi/ functions ***********************************************************************/ static int write_info(struct Scsi_Host *host, char *buffer, int length) { /* if someone is sending us data, just throw it away */ return length; } static int show_info (struct seq_file *m, struct Scsi_Host *host) { struct us_data *us = host_to_us(host); const char *string; /* print the controller name */ seq_printf(m, " Host scsi%d: usb-storage\n", host->host_no); /* print product, vendor, and serial number strings */ if (us->pusb_dev->manufacturer) string = us->pusb_dev->manufacturer; else if (us->unusual_dev->vendorName) string = us->unusual_dev->vendorName; else string = "Unknown"; seq_printf(m, " Vendor: %s\n", string); if (us->pusb_dev->product) string = us->pusb_dev->product; else if (us->unusual_dev->productName) string = us->unusual_dev->productName; else string = "Unknown"; seq_printf(m, " Product: %s\n", string); if (us->pusb_dev->serial) string = us->pusb_dev->serial; else string = "None"; seq_printf(m, "Serial Number: %s\n", string); /* show the protocol and transport */ seq_printf(m, " Protocol: %s\n", us->protocol_name); seq_printf(m, " Transport: %s\n", us->transport_name); /* show the device flags */ seq_printf(m, " Quirks:"); #define US_FLAG(name, value) \ if (us->fflags & value) seq_printf(m, " " #name); US_DO_ALL_FLAGS #undef US_FLAG seq_putc(m, '\n'); return 0; } /*********************************************************************** * Sysfs interface ***********************************************************************/ /* Output routine for the sysfs max_sectors file */ static ssize_t max_sectors_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); return sprintf(buf, "%u\n", queue_max_hw_sectors(sdev->request_queue)); } /* Input routine for the sysfs max_sectors file */ static ssize_t max_sectors_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_device *sdev = to_scsi_device(dev); struct queue_limits lim; unsigned short ms; int ret; if (sscanf(buf, "%hu", &ms) <= 0) return -EINVAL; lim = queue_limits_start_update(sdev->request_queue); lim.max_hw_sectors = ms; ret = queue_limits_commit_update_frozen(sdev->request_queue, &lim); if (ret) return ret; return count; } static DEVICE_ATTR_RW(max_sectors); static struct attribute *usb_sdev_attrs[] = { &dev_attr_max_sectors.attr, NULL, }; ATTRIBUTE_GROUPS(usb_sdev); /* * this defines our host template, with which we'll allocate hosts */ static const struct scsi_host_template usb_stor_host_template = { /* basic userland interface stuff */ .name = "usb-storage", .proc_name = "usb-storage", .show_info = show_info, .write_info = write_info, .info = host_info, /* command interface -- queued only */ .queuecommand = queuecommand, /* error and abort handlers */ .eh_abort_handler = command_abort, .eh_device_reset_handler = device_reset, .eh_bus_reset_handler = bus_reset, /* queue commands only, only one command per LUN */ .can_queue = 1, /* unknown initiator id */ .this_id = -1, .sdev_init = sdev_init, .sdev_configure = sdev_configure, .target_alloc = target_alloc, /* lots of sg segments can be handled */ .sg_tablesize = SG_MAX_SEGMENTS, /* * Some host controllers may have alignment requirements. * We'll play it safe by requiring 512-byte alignment always. */ .dma_alignment = 511, /* * Limit the total size of a transfer to 120 KB. * * Some devices are known to choke with anything larger. It seems like * the problem stems from the fact that original IDE controllers had * only an 8-bit register to hold the number of sectors in one transfer * and even those couldn't handle a full 256 sectors. * * Because we want to make sure we interoperate with as many devices as * possible, we will maintain a 240 sector transfer size limit for USB * Mass Storage devices. * * Tests show that other operating have similar limits with Microsoft * Windows 7 limiting transfers to 128 sectors for both USB2 and USB3 * and Apple Mac OS X 10.11 limiting transfers to 256 sectors for USB2 * and 2048 for USB3 devices. */ .max_sectors = 240, /* emulated HBA */ .emulated = 1, /* we do our own delay after a device or bus reset */ .skip_settle_delay = 1, /* sysfs device attributes */ .sdev_groups = usb_sdev_groups, /* module management */ .module = THIS_MODULE }; void usb_stor_host_template_init(struct scsi_host_template *sht, const char *name, struct module *owner) { *sht = usb_stor_host_template; sht->name = name; sht->proc_name = name; sht->module = owner; } EXPORT_SYMBOL_GPL(usb_stor_host_template_init); /* To Report "Illegal Request: Invalid Field in CDB */ unsigned char usb_stor_sense_invalidCDB[18] = { [0] = 0x70, /* current error */ [2] = ILLEGAL_REQUEST, /* Illegal Request = 0x05 */ [7] = 0x0a, /* additional length */ [12] = 0x24 /* Invalid Field in CDB */ }; EXPORT_SYMBOL_GPL(usb_stor_sense_invalidCDB);
1 672 8 16 214 121 537 4 2 1163 125 539 1 1075 204 2 20 197 654 26 11 491 1 293 293 80 28 293 38 293 5 15 18 123 27 32 32 32 1097 1622 1591 71 1084 539 122 567 5 4 133 133 6 19 11 17 229 186 402 1 6 284 3 234 235 283 223 1629 2 2 88 31 99 74 26 5 4 38 4 41 5 180 179 8 133 167 21 6 17 21 40 17 39 84 13 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_SCHED_GENERIC_H #define __NET_SCHED_GENERIC_H #include <linux/netdevice.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/pkt_sched.h> #include <linux/pkt_cls.h> #include <linux/percpu.h> #include <linux/dynamic_queue_limits.h> #include <linux/list.h> #include <linux/refcount.h> #include <linux/workqueue.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/atomic.h> #include <linux/hashtable.h> #include <net/gen_stats.h> #include <net/rtnetlink.h> #include <net/flow_offload.h> #include <linux/xarray.h> struct Qdisc_ops; struct qdisc_walker; struct tcf_walker; struct module; struct bpf_flow_keys; struct qdisc_rate_table { struct tc_ratespec rate; u32 data[256]; struct qdisc_rate_table *next; int refcnt; }; enum qdisc_state_t { __QDISC_STATE_SCHED, __QDISC_STATE_DEACTIVATED, __QDISC_STATE_MISSED, __QDISC_STATE_DRAINING, }; #define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) #define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) #define QDISC_STATE_NON_EMPTY (QDISC_STATE_MISSED | \ QDISC_STATE_DRAINING) struct qdisc_size_table { struct rcu_head rcu; struct list_head list; struct tc_sizespec szopts; int refcnt; u16 data[]; }; /* similar to sk_buff_head, but skb->prev pointer is undefined. */ struct qdisc_skb_head { struct sk_buff *head; struct sk_buff *tail; __u32 qlen; spinlock_t lock; }; struct Qdisc { int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free); struct sk_buff * (*dequeue)(struct Qdisc *sch); unsigned int flags; #define TCQ_F_BUILTIN 1 #define TCQ_F_INGRESS 2 #define TCQ_F_CAN_BYPASS 4 #define TCQ_F_MQROOT 8 #define TCQ_F_ONETXQUEUE 0x10 /* dequeue_skb() can assume all skbs are for * q->dev_queue : It can test * netif_xmit_frozen_or_stopped() before * dequeueing next packet. * Its true for MQ/MQPRIO slaves, or non * multiqueue device. */ #define TCQ_F_WARN_NONWC (1 << 16) #define TCQ_F_CPUSTATS 0x20 /* run using percpu statistics */ #define TCQ_F_NOPARENT 0x40 /* root of its hierarchy : * qdisc_tree_decrease_qlen() should stop. */ #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ #define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */ #define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ #define TCQ_F_DEQUEUE_DROPS 0x400 /* ->dequeue() can drop packets in q->to_free */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; struct hlist_node hash; u32 handle; u32 parent; struct netdev_queue *dev_queue; struct net_rate_estimator __rcu *rate_est; struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; int pad; refcount_t refcnt; /* Cache line potentially dirtied in dequeue() or __netif_reschedule(). */ __cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned; struct sk_buff_head gso_skb; struct Qdisc *next_sched; struct sk_buff_head skb_bad_txq; __cacheline_group_end(Qdisc_read_mostly); /* Fields dirtied in dequeue() fast path. */ __cacheline_group_begin(Qdisc_write) ____cacheline_aligned; struct qdisc_skb_head q; unsigned long state; struct gnet_stats_basic_sync bstats; bool running; /* must be written under qdisc spinlock */ /* Note : we only change qstats.backlog in fast path. */ struct gnet_stats_queue qstats; struct sk_buff *to_free; __cacheline_group_end(Qdisc_write); atomic_long_t defer_count ____cacheline_aligned_in_smp; struct llist_head defer_list; spinlock_t seqlock; struct rcu_head rcu; netdevice_tracker dev_tracker; struct lock_class_key root_lock_key; /* private data */ long privdata[] ____cacheline_aligned; }; static inline void qdisc_refcount_inc(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; refcount_inc(&qdisc->refcnt); } static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return true; return refcount_dec_if_one(&qdisc->refcnt); } /* Intended to be used by unlocked users, when concurrent qdisc release is * possible. */ static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return qdisc; if (refcount_inc_not_zero(&qdisc->refcnt)) return qdisc; return NULL; } /* For !TCQ_F_NOLOCK qdisc: callers must either call this within a qdisc * root_lock section, or provide their own memory barriers -- ordering * against qdisc_run_begin/end() atomic bit operations. */ static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) return spin_is_locked(&qdisc->seqlock); return READ_ONCE(qdisc->running); } static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) { return !(READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY); } static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) { return q->flags & TCQ_F_CPUSTATS; } static inline bool qdisc_is_empty(const struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) return nolock_qdisc_is_empty(qdisc); return !READ_ONCE(qdisc->q.qlen); } /* For !TCQ_F_NOLOCK qdisc, qdisc_run_begin/end() must be invoked with * the qdisc root lock acquired. */ static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { if (spin_trylock(&qdisc->seqlock)) return true; /* No need to insist if the MISSED flag was already set. * Note that test_and_set_bit() also gives us memory ordering * guarantees wrt potential earlier enqueue() and below * spin_trylock(), both of which are necessary to prevent races */ if (test_and_set_bit(__QDISC_STATE_MISSED, &qdisc->state)) return false; /* Try to take the lock again to make sure that we will either * grab it or the CPU that still has it will see MISSED set * when testing it in qdisc_run_end() */ return spin_trylock(&qdisc->seqlock); } if (READ_ONCE(qdisc->running)) return false; WRITE_ONCE(qdisc->running, true); return true; } static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc) { struct sk_buff *to_free = NULL; if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); /* spin_unlock() only has store-release semantic. The unlock * and test_bit() ordering is a store-load ordering, so a full * memory barrier is needed here. */ smp_mb(); if (unlikely(test_bit(__QDISC_STATE_MISSED, &qdisc->state))) __netif_schedule(qdisc); return NULL; } if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) { to_free = qdisc->to_free; if (to_free) qdisc->to_free = NULL; } WRITE_ONCE(qdisc->running, false); return to_free; } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) { return qdisc->flags & TCQ_F_ONETXQUEUE; } static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq) { return netdev_queue_dql_avail(txq); } struct Qdisc_class_ops { unsigned int flags; /* Child qdisc manipulation */ struct netdev_queue * (*select_queue)(struct Qdisc *, struct tcmsg *); int (*graft)(struct Qdisc *, unsigned long cl, struct Qdisc *, struct Qdisc **, struct netlink_ext_ack *extack); struct Qdisc * (*leaf)(struct Qdisc *, unsigned long cl); void (*qlen_notify)(struct Qdisc *, unsigned long); /* Class manipulation routines */ unsigned long (*find)(struct Qdisc *, u32 classid); int (*change)(struct Qdisc *, u32, u32, struct nlattr **, unsigned long *, struct netlink_ext_ack *); int (*delete)(struct Qdisc *, unsigned long, struct netlink_ext_ack *); void (*walk)(struct Qdisc *, struct qdisc_walker * arg); /* Filter manipulation */ struct tcf_block * (*tcf_block)(struct Qdisc *sch, unsigned long arg, struct netlink_ext_ack *extack); unsigned long (*bind_tcf)(struct Qdisc *, unsigned long, u32 classid); void (*unbind_tcf)(struct Qdisc *, unsigned long); /* rtnetlink specific */ int (*dump)(struct Qdisc *, unsigned long, struct sk_buff *skb, struct tcmsg*); int (*dump_stats)(struct Qdisc *, unsigned long, struct gnet_dump *); }; /* Qdisc_class_ops flag values */ /* Implements API that doesn't require rtnl lock */ enum qdisc_class_ops_flags { QDISC_CLASS_OPS_DOIT_UNLOCKED = 1, }; struct Qdisc_ops { struct Qdisc_ops *next; const struct Qdisc_class_ops *cl_ops; char id[IFNAMSIZ]; int priv_size; unsigned int static_flags; int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free); struct sk_buff * (*dequeue)(struct Qdisc *); struct sk_buff * (*peek)(struct Qdisc *); int (*init)(struct Qdisc *sch, struct nlattr *arg, struct netlink_ext_ack *extack); void (*reset)(struct Qdisc *); void (*destroy)(struct Qdisc *); int (*change)(struct Qdisc *sch, struct nlattr *arg, struct netlink_ext_ack *extack); void (*attach)(struct Qdisc *sch); int (*change_tx_queue_len)(struct Qdisc *, unsigned int); void (*change_real_num_tx)(struct Qdisc *sch, unsigned int new_real_tx); int (*dump)(struct Qdisc *, struct sk_buff *); int (*dump_stats)(struct Qdisc *, struct gnet_dump *); void (*ingress_block_set)(struct Qdisc *sch, u32 block_index); void (*egress_block_set)(struct Qdisc *sch, u32 block_index); u32 (*ingress_block_get)(struct Qdisc *sch); u32 (*egress_block_get)(struct Qdisc *sch); struct module *owner; }; struct tcf_result { union { struct { unsigned long class; u32 classid; }; const struct tcf_proto *goto_tp; }; }; struct tcf_chain; struct tcf_proto_ops { struct list_head head; char kind[IFNAMSIZ]; int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); int (*init)(struct tcf_proto*); void (*destroy)(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack); void* (*get)(struct tcf_proto*, u32 handle); void (*put)(struct tcf_proto *tp, void *f); int (*change)(struct net *net, struct sk_buff *, struct tcf_proto*, unsigned long, u32 handle, struct nlattr **, void **, u32, struct netlink_ext_ack *); int (*delete)(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *); bool (*delete_empty)(struct tcf_proto *tp); void (*walk)(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held); int (*reoffload)(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack); void (*hw_add)(struct tcf_proto *tp, void *type_data); void (*hw_del)(struct tcf_proto *tp, void *type_data); void (*bind_class)(void *, u32, unsigned long, void *, unsigned long); void * (*tmplt_create)(struct net *net, struct tcf_chain *chain, struct nlattr **tca, struct netlink_ext_ack *extack); void (*tmplt_destroy)(void *tmplt_priv); void (*tmplt_reoffload)(struct tcf_chain *chain, bool add, flow_setup_cb_t *cb, void *cb_priv); struct tcf_exts * (*get_exts)(const struct tcf_proto *tp, u32 handle); /* rtnetlink specific */ int (*dump)(struct net*, struct tcf_proto*, void *, struct sk_buff *skb, struct tcmsg*, bool); int (*terse_dump)(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held); int (*tmplt_dump)(struct sk_buff *skb, struct net *net, void *tmplt_priv); struct module *owner; int flags; }; /* Classifiers setting TCF_PROTO_OPS_DOIT_UNLOCKED in tcf_proto_ops->flags * are expected to implement tcf_proto_ops->delete_empty(), otherwise race * conditions can occur when filters are inserted/deleted simultaneously. */ enum tcf_proto_ops_flags { TCF_PROTO_OPS_DOIT_UNLOCKED = 1, }; struct tcf_proto { /* Fast access part */ struct tcf_proto __rcu *next; void __rcu *root; /* called under RCU BH lock*/ int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); __be16 protocol; /* All the rest */ u32 prio; void *data; const struct tcf_proto_ops *ops; struct tcf_chain *chain; /* Lock protects tcf_proto shared state and can be used by unlocked * classifiers to protect their private data. */ spinlock_t lock; bool deleting; bool counted; bool usesw; refcount_t refcnt; struct rcu_head rcu; struct hlist_node destroy_ht_node; }; struct qdisc_skb_cb { unsigned int pkt_len; u16 pkt_segs; u16 tc_classid; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; u16 slave_dev_queue_mapping; u8 post_ct:1; u8 post_ct_snat:1; u8 post_ct_dnat:1; }; typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); struct tcf_chain { /* Protects filter_chain. */ struct mutex filter_chain_lock; struct tcf_proto __rcu *filter_chain; struct list_head list; struct tcf_block *block; u32 index; /* chain index */ unsigned int refcnt; unsigned int action_refcnt; bool explicitly_created; bool flushing; const struct tcf_proto_ops *tmplt_ops; void *tmplt_priv; struct rcu_head rcu; }; struct tcf_block { struct xarray ports; /* datapath accessible */ /* Lock protects tcf_block and lifetime-management data of chains * attached to the block (refcnt, action_refcnt, explicitly_created). */ struct mutex lock; struct list_head chain_list; u32 index; /* block index for shared blocks */ u32 classid; /* which class this block belongs to */ refcount_t refcnt; struct net *net; struct Qdisc *q; struct rw_semaphore cb_lock; /* protects cb_list and offload counters */ struct flow_block flow_block; struct list_head owner_list; bool keep_dst; atomic_t useswcnt; atomic_t offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */ struct { struct tcf_chain *chain; struct list_head filter_chain_list; } chain0; struct rcu_head rcu; DECLARE_HASHTABLE(proto_destroy_ht, 7); struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. */ }; struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index); static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain) { return lockdep_is_held(&chain->filter_chain_lock); } static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp) { return lockdep_is_held(&tp->lock); } #define tcf_chain_dereference(p, chain) \ rcu_dereference_protected(p, lockdep_tcf_chain_is_locked(chain)) #define tcf_proto_dereference(p, tp) \ rcu_dereference_protected(p, lockdep_tcf_proto_is_locked(tp)) static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) { struct qdisc_skb_cb *qcb; BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*qcb)); BUILD_BUG_ON(sizeof(qcb->data) < sz); } static inline int qdisc_qlen(const struct Qdisc *q) { return q->q.qlen; } static inline int qdisc_qlen_sum(const struct Qdisc *q) { __u32 qlen = q->qstats.qlen; int i; if (qdisc_is_percpu_stats(q)) { for_each_possible_cpu(i) qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen; } else { qlen += q->q.qlen; } return qlen; } static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb) { return (struct qdisc_skb_cb *)skb->cb; } static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc) { return &qdisc->q.lock; } static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc) { struct Qdisc *q = rcu_dereference_rtnl(qdisc->dev_queue->qdisc); return q; } static inline struct Qdisc *qdisc_root_bh(const struct Qdisc *qdisc) { return rcu_dereference_bh(qdisc->dev_queue->qdisc); } static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc) { return rcu_dereference_rtnl(qdisc->dev_queue->qdisc_sleeping); } static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) { struct Qdisc *root = qdisc_root_sleeping(qdisc); ASSERT_RTNL(); return qdisc_lock(root); } static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc) { return qdisc->dev_queue->dev; } static inline void sch_tree_lock(struct Qdisc *q) { if (q->flags & TCQ_F_MQROOT) spin_lock_bh(qdisc_lock(q)); else spin_lock_bh(qdisc_root_sleeping_lock(q)); } static inline void sch_tree_unlock(struct Qdisc *q) { if (q->flags & TCQ_F_MQROOT) spin_unlock_bh(qdisc_lock(q)); else spin_unlock_bh(qdisc_root_sleeping_lock(q)); } extern struct Qdisc noop_qdisc; extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_fast_ops; extern const u8 sch_default_prio2band[TC_PRIO_MAX + 1]; extern struct Qdisc_ops mq_qdisc_ops; extern struct Qdisc_ops noqueue_qdisc_ops; extern const struct Qdisc_ops *default_qdisc_ops; static inline const struct Qdisc_ops * get_default_qdisc_ops(const struct net_device *dev, int ntx) { return ntx < dev->real_num_tx_queues ? default_qdisc_ops : &pfifo_fast_ops; } struct Qdisc_class_common { u32 classid; unsigned int filter_cnt; struct hlist_node hnode; }; struct Qdisc_class_hash { struct hlist_head *hash; unsigned int hashsize; unsigned int hashmask; unsigned int hashelems; }; static inline unsigned int qdisc_class_hash(u32 id, u32 mask) { id ^= id >> 8; id ^= id >> 4; return id & mask; } static inline struct Qdisc_class_common * qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id) { struct Qdisc_class_common *cl; unsigned int h; if (!id) return NULL; h = qdisc_class_hash(id, hash->hashmask); hlist_for_each_entry(cl, &hash->hash[h], hnode) { if (cl->classid == id) return cl; } return NULL; } static inline bool qdisc_class_in_use(const struct Qdisc_class_common *cl) { return cl->filter_cnt > 0; } static inline void qdisc_class_get(struct Qdisc_class_common *cl) { unsigned int res; if (check_add_overflow(cl->filter_cnt, 1, &res)) WARN(1, "Qdisc class overflow"); cl->filter_cnt = res; } static inline void qdisc_class_put(struct Qdisc_class_common *cl) { unsigned int res; if (check_sub_overflow(cl->filter_cnt, 1, &res)) WARN(1, "Qdisc class underflow"); cl->filter_cnt = res; } static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid) { u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY; return (hwtc < netdev_get_num_tc(dev)) ? hwtc : -EINVAL; } int qdisc_class_hash_init(struct Qdisc_class_hash *); void qdisc_class_hash_insert(struct Qdisc_class_hash *, struct Qdisc_class_common *); void qdisc_class_hash_remove(struct Qdisc_class_hash *, struct Qdisc_class_common *); void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *); void qdisc_class_hash_destroy(struct Qdisc_class_hash *); int dev_qdisc_change_tx_queue_len(struct net_device *dev); void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx); void dev_init_scheduler(struct net_device *dev); void dev_shutdown(struct net_device *dev); void dev_activate(struct net_device *dev); void dev_deactivate(struct net_device *dev); void dev_deactivate_many(struct list_head *head); struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc); void qdisc_reset(struct Qdisc *qdisc); void qdisc_destroy(struct Qdisc *qdisc); void qdisc_put(struct Qdisc *qdisc); void qdisc_put_unlocked(struct Qdisc *qdisc); void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len); #ifdef CONFIG_NET_SCHED int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type, void *type_data); void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, struct Qdisc *new, struct Qdisc *old, enum tc_setup_type type, void *type_data, struct netlink_ext_ack *extack); #else static inline int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type, void *type_data) { q->flags &= ~TCQ_F_OFFLOADED; return 0; } static inline void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, struct Qdisc *new, struct Qdisc *old, enum tc_setup_type type, void *type_data, struct netlink_ext_ack *extack) { } #endif void qdisc_offload_query_caps(struct net_device *dev, enum tc_setup_type type, void *caps, size_t caps_len); struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack); void qdisc_free(struct Qdisc *qdisc); struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, u32 parentid, struct netlink_ext_ack *extack); void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab); int skb_do_redirect(struct sk_buff *); static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { #ifdef CONFIG_NET_XGRESS return skb->tc_at_ingress; #else return false; #endif } static inline bool skb_skip_tc_classify(struct sk_buff *skb) { #ifdef CONFIG_NET_CLS_ACT if (skb->tc_skip_classify) { skb->tc_skip_classify = 0; return true; } #endif return false; } /* Reset all TX qdiscs greater than index of a device. */ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) { struct Qdisc *qdisc; for (; i < dev->num_tx_queues; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); if (qdisc) { spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } } } /* Are all TX queues of the device empty? */ static inline bool qdisc_all_tx_empty(const struct net_device *dev) { unsigned int i; rcu_read_lock(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); const struct Qdisc *q = rcu_dereference(txq->qdisc); if (!qdisc_is_empty(q)) { rcu_read_unlock(); return false; } } rcu_read_unlock(); return true; } /* Are any of the TX qdiscs changing? */ static inline bool qdisc_tx_changing(const struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); if (rcu_access_pointer(txq->qdisc) != rcu_access_pointer(txq->qdisc_sleeping)) return true; } return false; } /* "noqueue" qdisc identified by not having any enqueue, see noqueue_init() */ static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq) { struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc); return qdisc->enqueue == NULL; } /* Is the device using the noop qdisc on all queues? */ static inline bool qdisc_tx_is_noop(const struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); if (rcu_access_pointer(txq->qdisc) != &noop_qdisc) return false; } return true; } static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb) { return qdisc_skb_cb(skb)->pkt_len; } static inline unsigned int qdisc_pkt_segs(const struct sk_buff *skb) { u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs; DEBUG_NET_WARN_ON_ONCE(pkt_segs != (skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1)); return pkt_segs; } /* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */ enum net_xmit_qdisc_t { __NET_XMIT_STOLEN = 0x00010000, __NET_XMIT_BYPASS = 0x00020000, }; #ifdef CONFIG_NET_CLS_ACT #define net_xmit_drop_count(e) ((e) & __NET_XMIT_STOLEN ? 0 : 1) #else #define net_xmit_drop_count(e) (1) #endif static inline void qdisc_calculate_pkt_len(struct sk_buff *skb, const struct Qdisc *sch) { #ifdef CONFIG_NET_SCHED struct qdisc_size_table *stab = rcu_dereference_bh(sch->stab); if (stab) __qdisc_calculate_pkt_len(skb, stab); #endif } static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { return sch->enqueue(skb, sch, to_free); } static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, __u64 bytes, __u64 packets) { u64_stats_update_begin(&bstats->syncp); u64_stats_add(&bstats->bytes, bytes); u64_stats_add(&bstats->packets, packets); u64_stats_update_end(&bstats->syncp); } static inline void bstats_update(struct gnet_stats_basic_sync *bstats, const struct sk_buff *skb) { _bstats_update(bstats, qdisc_pkt_len(skb), qdisc_pkt_segs(skb)); } static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, const struct sk_buff *skb) { bstats_update(this_cpu_ptr(sch->cpu_bstats), skb); } static inline void qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) { bstats_update(&sch->bstats, skb); } static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch, const struct sk_buff *skb) { sch->qstats.backlog -= qdisc_pkt_len(skb); } static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch, const struct sk_buff *skb) { this_cpu_sub(sch->cpu_qstats->backlog, qdisc_pkt_len(skb)); } static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch, const struct sk_buff *skb) { sch->qstats.backlog += qdisc_pkt_len(skb); } static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch, const struct sk_buff *skb) { this_cpu_add(sch->cpu_qstats->backlog, qdisc_pkt_len(skb)); } static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch) { this_cpu_inc(sch->cpu_qstats->qlen); } static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch) { this_cpu_dec(sch->cpu_qstats->qlen); } static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch) { this_cpu_inc(sch->cpu_qstats->requeues); } static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count) { sch->qstats.drops += count; } static inline void qstats_drop_inc(struct gnet_stats_queue *qstats) { qstats->drops++; } static inline void qstats_overlimit_inc(struct gnet_stats_queue *qstats) { qstats->overlimits++; } static inline void qdisc_qstats_drop(struct Qdisc *sch) { qstats_drop_inc(&sch->qstats); } static inline void qdisc_qstats_cpu_drop(struct Qdisc *sch) { this_cpu_inc(sch->cpu_qstats->drops); } static inline void qdisc_qstats_overlimit(struct Qdisc *sch) { sch->qstats.overlimits++; } static inline int qdisc_qstats_copy(struct gnet_dump *d, struct Qdisc *sch) { __u32 qlen = qdisc_qlen_sum(sch); return gnet_stats_copy_queue(d, sch->cpu_qstats, &sch->qstats, qlen); } static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen, __u32 *backlog) { struct gnet_stats_queue qstats = { 0 }; gnet_stats_add_queue(&qstats, sch->cpu_qstats, &sch->qstats); *qlen = qstats.qlen + qdisc_qlen(sch); *backlog = qstats.backlog; } static inline void qdisc_purge_queue(struct Qdisc *sch) { __u32 qlen, backlog; qdisc_qstats_qlen_backlog(sch, &qlen, &backlog); qdisc_reset(sch); qdisc_tree_reduce_backlog(sch, qlen, backlog); } static inline void __qdisc_enqueue_tail(struct sk_buff *skb, struct qdisc_skb_head *qh) { struct sk_buff *last = qh->tail; if (last) { skb->next = NULL; last->next = skb; qh->tail = skb; } else { qh->tail = skb; qh->head = skb; } qh->qlen++; } static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch) { __qdisc_enqueue_tail(skb, &sch->q); qdisc_qstats_backlog_inc(sch, skb); return NET_XMIT_SUCCESS; } static inline void __qdisc_enqueue_head(struct sk_buff *skb, struct qdisc_skb_head *qh) { skb->next = qh->head; if (!qh->head) qh->tail = skb; qh->head = skb; qh->qlen++; } static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh) { struct sk_buff *skb = qh->head; if (likely(skb != NULL)) { qh->head = skb->next; qh->qlen--; if (qh->head == NULL) qh->tail = NULL; skb->next = NULL; } return skb; } static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool direct) { struct sk_buff *skb; skb = __skb_dequeue(&sch->gso_skb); if (skb) { sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); return skb; } if (direct) { skb = __qdisc_dequeue_head(&sch->q); if (skb) qdisc_qstats_backlog_dec(sch, skb); return skb; } else { return sch->dequeue(sch); } } static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) { struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); if (likely(skb != NULL)) { qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); } return skb; } struct tc_skb_cb { struct qdisc_skb_cb qdisc_cb; u32 drop_reason; u16 zone; /* Only valid if qdisc_skb_cb(skb)->post_ct = true */ u16 mru; }; static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) { struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); return cb; } static inline enum skb_drop_reason tcf_get_drop_reason(const struct sk_buff *skb) { return tc_skb_cb(skb)->drop_reason; } static inline void tcf_set_drop_reason(const struct sk_buff *skb, enum skb_drop_reason reason) { tc_skb_cb(skb)->drop_reason = reason; } static inline void tcf_kfree_skb_list(struct sk_buff *skb) { while (unlikely(skb)) { struct sk_buff *next = skb->next; prefetch(next); kfree_skb_reason(skb, tcf_get_drop_reason(skb)); skb = next; } } static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb, enum skb_drop_reason reason) { DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS)); DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK); tcf_set_drop_reason(skb, reason); skb->next = q->to_free; q->to_free = skb; } /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free) { skb->next = *to_free; *to_free = skb; } static inline void __qdisc_drop_all(struct sk_buff *skb, struct sk_buff **to_free) { if (skb->prev) skb->prev->next = *to_free; else skb->next = *to_free; *to_free = skb; } static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch, struct qdisc_skb_head *qh, struct sk_buff **to_free) { struct sk_buff *skb = __qdisc_dequeue_head(qh); if (likely(skb != NULL)) { unsigned int len = qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); __qdisc_drop(skb, to_free); return len; } return 0; } static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch) { const struct qdisc_skb_head *qh = &sch->q; return qh->head; } /* generic pseudo peek method for non-work-conserving qdisc */ static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch) { struct sk_buff *skb = skb_peek(&sch->gso_skb); /* we can reuse ->gso_skb because peek isn't called for root qdiscs */ if (!skb) { skb = sch->dequeue(sch); if (skb) { __skb_queue_head(&sch->gso_skb, skb); /* it's still part of the queue */ qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; } } return skb; } static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch, struct sk_buff *skb) { if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_backlog_dec(sch, skb); qdisc_bstats_cpu_update(sch, skb); qdisc_qstats_cpu_qlen_dec(sch); } else { qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); sch->q.qlen--; } } static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch, unsigned int pkt_len) { if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_qlen_inc(sch); this_cpu_add(sch->cpu_qstats->backlog, pkt_len); } else { sch->qstats.backlog += pkt_len; sch->q.qlen++; } } /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) { struct sk_buff *skb = skb_peek(&sch->gso_skb); if (skb) { skb = __skb_dequeue(&sch->gso_skb); if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_backlog_dec(sch, skb); qdisc_qstats_cpu_qlen_dec(sch); } else { qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; } } else { skb = sch->dequeue(sch); } return skb; } static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh) { /* * We do not know the backlog in bytes of this list, it * is up to the caller to correct it */ ASSERT_RTNL(); if (qh->qlen) { rtnl_kfree_skbs(qh->head, qh->tail); qh->head = NULL; qh->tail = NULL; qh->qlen = 0; } } static inline void qdisc_reset_queue(struct Qdisc *sch) { __qdisc_reset_queue(&sch->q); } static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, struct Qdisc **pold) { struct Qdisc *old; sch_tree_lock(sch); old = *pold; *pold = new; if (old != NULL) qdisc_purge_queue(old); sch_tree_unlock(sch); return old; } static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) { rtnl_kfree_skbs(skb, skb); qdisc_qstats_drop(sch); } static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { __qdisc_drop(skb, to_free); qdisc_qstats_cpu_drop(sch); return NET_XMIT_DROP; } static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { __qdisc_drop(skb, to_free); qdisc_qstats_drop(sch); return NET_XMIT_DROP; } static inline int qdisc_drop_reason(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free, enum skb_drop_reason reason) { tcf_set_drop_reason(skb, reason); return qdisc_drop(skb, sch, to_free); } static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { __qdisc_drop_all(skb, to_free); qdisc_qstats_drop(sch); return NET_XMIT_DROP; } struct psched_ratecfg { u64 rate_bytes_ps; /* bytes per second */ u32 mult; u16 overhead; u16 mpu; u8 linklayer; u8 shift; }; static inline u64 psched_l2t_ns(const struct psched_ratecfg *r, unsigned int len) { len += r->overhead; if (len < r->mpu) len = r->mpu; if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) return ((u64)(DIV_ROUND_UP(len,48)*53) * r->mult) >> r->shift; return ((u64)len * r->mult) >> r->shift; } void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf, u64 rate64); static inline void psched_ratecfg_getrate(struct tc_ratespec *res, const struct psched_ratecfg *r) { memset(res, 0, sizeof(*res)); /* legacy struct tc_ratespec has a 32bit @rate field * Qdisc using 64bit rate should add new attributes * in order to maintain compatibility. */ res->rate = min_t(u64, r->rate_bytes_ps, ~0U); res->overhead = r->overhead; res->mpu = r->mpu; res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } struct psched_pktrate { u64 rate_pkts_ps; /* packets per second */ u32 mult; u8 shift; }; static inline u64 psched_pkt2t_ns(const struct psched_pktrate *r, unsigned int pkt_num) { return ((u64)pkt_num * r->mult) >> r->shift; } void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64); /* Mini Qdisc serves for specific needs of ingress/clsact Qdisc. * The fast path only needs to access filter list and to update stats */ struct mini_Qdisc { struct tcf_proto *filter_list; struct tcf_block *block; struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; unsigned long rcu_state; }; static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq, const struct sk_buff *skb) { bstats_update(this_cpu_ptr(miniq->cpu_bstats), skb); } static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq) { this_cpu_inc(miniq->cpu_qstats->drops); } struct mini_Qdisc_pair { struct mini_Qdisc miniq1; struct mini_Qdisc miniq2; struct mini_Qdisc __rcu **p_miniq; }; void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, struct tcf_proto *tp_head); void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq); void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp, struct tcf_block *block); void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx); int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)); /* Make sure qdisc is no longer in SCHED state. */ static inline void qdisc_synchronize(const struct Qdisc *q) { while (test_bit(__QDISC_STATE_SCHED, &q->state)) msleep(1); } #endif
3079 2897 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 /* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions related to Power Management Quality of Service (PM QoS). * * Copyright (C) 2020 Intel Corporation * * Authors: * Mark Gross <mgross@linux.intel.com> * Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ #ifndef _LINUX_PM_QOS_H #define _LINUX_PM_QOS_H #include <linux/plist.h> #include <linux/notifier.h> #include <linux/device.h> enum pm_qos_flags_status { PM_QOS_FLAGS_UNDEFINED = -1, PM_QOS_FLAGS_NONE, PM_QOS_FLAGS_SOME, PM_QOS_FLAGS_ALL, }; #define PM_QOS_DEFAULT_VALUE (-1) #define PM_QOS_LATENCY_ANY S32_MAX #define PM_QOS_LATENCY_ANY_NS ((s64)PM_QOS_LATENCY_ANY * NSEC_PER_USEC) #define PM_QOS_CPU_LATENCY_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_RESUME_LATENCY_DEFAULT_VALUE PM_QOS_LATENCY_ANY #define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT PM_QOS_LATENCY_ANY #define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS PM_QOS_LATENCY_ANY_NS #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE 0 #define PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE 0 #define PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE FREQ_QOS_MAX_DEFAULT_VALUE #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT (-1) #define PM_QOS_FLAG_NO_POWER_OFF (1 << 0) enum pm_qos_type { PM_QOS_UNITIALIZED, PM_QOS_MAX, /* return the largest value */ PM_QOS_MIN, /* return the smallest value */ }; /* * Note: The lockless read path depends on the CPU accessing target_value * or effective_flags atomically. Atomic access is only guaranteed on all CPU * types linux supports for 32 bit quantites */ struct pm_qos_constraints { struct plist_head list; s32 target_value; /* Do not change to 64 bit */ s32 default_value; s32 no_constraint_value; enum pm_qos_type type; struct blocking_notifier_head *notifiers; }; struct pm_qos_request { struct plist_node node; struct pm_qos_constraints *qos; }; struct pm_qos_flags_request { struct list_head node; s32 flags; /* Do not change to 64 bit */ }; struct pm_qos_flags { struct list_head list; s32 effective_flags; /* Do not change to 64 bit */ }; #define FREQ_QOS_MIN_DEFAULT_VALUE 0 #define FREQ_QOS_MAX_DEFAULT_VALUE S32_MAX enum freq_qos_req_type { FREQ_QOS_MIN = 1, FREQ_QOS_MAX, }; struct freq_constraints { struct pm_qos_constraints min_freq; struct blocking_notifier_head min_freq_notifiers; struct pm_qos_constraints max_freq; struct blocking_notifier_head max_freq_notifiers; }; struct freq_qos_request { enum freq_qos_req_type type; struct plist_node pnode; struct freq_constraints *qos; }; enum dev_pm_qos_req_type { DEV_PM_QOS_RESUME_LATENCY = 1, DEV_PM_QOS_LATENCY_TOLERANCE, DEV_PM_QOS_MIN_FREQUENCY, DEV_PM_QOS_MAX_FREQUENCY, DEV_PM_QOS_FLAGS, }; struct dev_pm_qos_request { enum dev_pm_qos_req_type type; union { struct plist_node pnode; struct pm_qos_flags_request flr; struct freq_qos_request freq; } data; struct device *dev; }; struct dev_pm_qos { struct pm_qos_constraints resume_latency; struct pm_qos_constraints latency_tolerance; struct freq_constraints freq; struct pm_qos_flags flags; struct dev_pm_qos_request *resume_latency_req; struct dev_pm_qos_request *latency_tolerance_req; struct dev_pm_qos_request *flags_req; }; /* Action requested to pm_qos_update_target */ enum pm_qos_req_action { PM_QOS_ADD_REQ, /* Add a new request */ PM_QOS_UPDATE_REQ, /* Update an existing request */ PM_QOS_REMOVE_REQ /* Remove an existing request */ }; static inline int dev_pm_qos_request_active(struct dev_pm_qos_request *req) { return req->dev != NULL; } s32 pm_qos_read_value(struct pm_qos_constraints *c); int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, enum pm_qos_req_action action, int value); bool pm_qos_update_flags(struct pm_qos_flags *pqf, struct pm_qos_flags_request *req, enum pm_qos_req_action action, s32 val); #ifdef CONFIG_CPU_IDLE s32 cpu_latency_qos_limit(void); bool cpu_latency_qos_request_active(struct pm_qos_request *req); void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value); void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value); void cpu_latency_qos_remove_request(struct pm_qos_request *req); #else static inline s32 cpu_latency_qos_limit(void) { return INT_MAX; } static inline bool cpu_latency_qos_request_active(struct pm_qos_request *req) { return false; } static inline void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) {} static inline void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) {} static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {} #endif #ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP s32 cpu_wakeup_latency_qos_limit(void); #else static inline s32 cpu_wakeup_latency_qos_limit(void) { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } #endif #ifdef CONFIG_PM enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask); enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask); s32 __dev_pm_qos_resume_latency(struct device *dev); s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type); int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value); int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value); int dev_pm_qos_remove_request(struct dev_pm_qos_request *req); int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type); int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type); void dev_pm_qos_constraints_init(struct device *dev); void dev_pm_qos_constraints_destroy(struct device *dev); int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value); int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value); void dev_pm_qos_hide_latency_limit(struct device *dev); int dev_pm_qos_expose_flags(struct device *dev, s32 value); void dev_pm_qos_hide_flags(struct device *dev); int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set); s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev); int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val); int dev_pm_qos_expose_latency_tolerance(struct device *dev); void dev_pm_qos_hide_latency_tolerance(struct device *dev); static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev) { return dev->power.qos->resume_latency_req->data.pnode.prio; } static inline s32 dev_pm_qos_requested_flags(struct device *dev) { return dev->power.qos->flags_req->data.flr.flags; } static inline s32 dev_pm_qos_raw_resume_latency(struct device *dev) { return IS_ERR_OR_NULL(dev->power.qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&dev->power.qos->resume_latency); } #else static inline enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { return PM_QOS_FLAGS_UNDEFINED; } static inline enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { return PM_QOS_FLAGS_UNDEFINED; } static inline s32 __dev_pm_qos_resume_latency(struct device *dev) { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } static inline s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { switch (type) { case DEV_PM_QOS_RESUME_LATENCY: return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; case DEV_PM_QOS_MIN_FREQUENCY: return PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE; case DEV_PM_QOS_MAX_FREQUENCY: return PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE; default: WARN_ON(1); return 0; } } static inline int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { return 0; } static inline int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { return 0; } static inline int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { return 0; } static inline int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { return 0; } static inline int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { return 0; } static inline void dev_pm_qos_constraints_init(struct device *dev) { dev->power.power_state = PMSG_ON; } static inline void dev_pm_qos_constraints_destroy(struct device *dev) { dev->power.power_state = PMSG_INVALID; } static inline int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { return 0; } static inline int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { return 0; } static inline void dev_pm_qos_hide_latency_limit(struct device *dev) {} static inline int dev_pm_qos_expose_flags(struct device *dev, s32 value) { return 0; } static inline void dev_pm_qos_hide_flags(struct device *dev) {} static inline int dev_pm_qos_update_flags(struct device *dev, s32 m, bool set) { return 0; } static inline s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { return PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; } static inline int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { return 0; } static inline int dev_pm_qos_expose_latency_tolerance(struct device *dev) { return 0; } static inline void dev_pm_qos_hide_latency_tolerance(struct device *dev) {} static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev) { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } static inline s32 dev_pm_qos_requested_flags(struct device *dev) { return 0; } static inline s32 dev_pm_qos_raw_resume_latency(struct device *dev) { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } #endif static inline int freq_qos_request_active(struct freq_qos_request *req) { return !IS_ERR_OR_NULL(req->qos); } void freq_constraints_init(struct freq_constraints *qos); s32 freq_qos_read_value(struct freq_constraints *qos, enum freq_qos_req_type type); int freq_qos_add_request(struct freq_constraints *qos, struct freq_qos_request *req, enum freq_qos_req_type type, s32 value); int freq_qos_update_request(struct freq_qos_request *req, s32 new_value); int freq_qos_remove_request(struct freq_qos_request *req); int freq_qos_apply(struct freq_qos_request *req, enum pm_qos_req_action action, s32 value); int freq_qos_add_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, struct notifier_block *notifier); int freq_qos_remove_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, struct notifier_block *notifier); #endif
65 65 64 65 3 29 64 65 64 65 5 17 65 20 25 41 6 15 65 63 2 2 2 2 64 65 65 1 1 9 5 4 9 9 9 5 5 5 2 2 9 3 4 1 1 4 4 4 4 2 4 6 6 6 6 2 3 2 6 3 2 3 3 3 3 2 2 2 2 2 4 2 34 34 34 14 31 33 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> #include <net/ipv6.h> #include <net/inet6_hashtables.h> #include <net/addrconf.h> #include "rds.h" #include "loop.h" #define RDS_CONNECTION_HASH_BITS 12 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) #define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1) /* converting this to RCU is a chore for another day.. */ static DEFINE_SPINLOCK(rds_conn_lock); static unsigned long rds_conn_count; static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; static struct kmem_cache *rds_conn_slab; static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, const struct in6_addr *faddr) { static u32 rds6_hash_secret __read_mostly; static u32 rds_hash_secret __read_mostly; __be32 lhash, fhash; u32 hash; net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); lhash = laddr->s6_addr32[3]; #if IS_ENABLED(CONFIG_IPV6) fhash = (__force __be32)__ipv6_addr_jhash(faddr, rds6_hash_secret); #else fhash = faddr->s6_addr32[3]; #endif hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; } #define rds_conn_info_set(var, test, suffix) do { \ if (test) \ var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ } while (0) /* rcu read lock must be held or the connection spinlock */ static struct rds_connection *rds_conn_lookup(struct net *net, struct hlist_head *head, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, int dev_if) { struct rds_connection *conn, *ret = NULL; hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (ipv6_addr_equal(&conn->c_faddr, faddr) && ipv6_addr_equal(&conn->c_laddr, laddr) && conn->c_trans == trans && conn->c_tos == tos && net == rds_conn_net(conn) && conn->c_dev_if == dev_if) { ret = conn; break; } } rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret, laddr, faddr); return ret; } /* * This is called by transports as they're bringing down a connection. * It clears partial message state so that the transport can start sending * and receiving over this connection again in the future. It is up to * the transport to have serialized this call with its send and recv. */ static void rds_conn_path_reset(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; rdsdebug("connection %pI6c to %pI6c reset\n", &conn->c_laddr, &conn->c_faddr); rds_stats_inc(s_conn_reset); rds_send_path_reset(cp); cp->cp_flags = 0; /* Do not clear next_rx_seq here, else we cannot distinguish * retransmitted packets from new packets, and will hand all * of them to the application. That is not consistent with the * reliability guarantees of RDS. */ } static void __rds_conn_path_init(struct rds_connection *conn, struct rds_conn_path *cp, bool is_outgoing) { spin_lock_init(&cp->cp_lock); cp->cp_next_tx_seq = 1; init_waitqueue_head(&cp->cp_waitq); INIT_LIST_HEAD(&cp->cp_send_queue); INIT_LIST_HEAD(&cp->cp_retrans); cp->cp_conn = conn; atomic_set(&cp->cp_state, RDS_CONN_DOWN); cp->cp_send_gen = 0; cp->cp_reconnect_jiffies = 0; cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker); INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker); INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker); INIT_WORK(&cp->cp_down_w, rds_shutdown_worker); mutex_init(&cp->cp_cm_lock); cp->cp_flags = 0; } /* * There is only every one 'conn' for a given pair of addresses in the * system at a time. They contain messages to be retransmitted and so * span the lifetime of the actual underlying transport connections. * * For now they are not garbage collected once they're created. They * are torn down as the module is removed, if ever. */ static struct rds_connection *__rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, gfp_t gfp, u8 tos, int is_outgoing, int dev_if) { struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); struct rds_transport *loop_trans; unsigned long flags; int ret, i; int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); rcu_read_lock(); conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && ipv6_addr_equal(laddr, faddr) && !is_outgoing) { /* This is a looped back IB connection, and we're * called by the code handling the incoming connect. * We need a second connection object into which we * can stick the other QP. */ parent = conn; conn = parent->c_passive; } rcu_read_unlock(); if (conn) goto out; conn = kmem_cache_zalloc(rds_conn_slab, gfp); if (!conn) { conn = ERR_PTR(-ENOMEM); goto out; } conn->c_path = kcalloc(npaths, sizeof(struct rds_conn_path), gfp); if (!conn->c_path) { kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(-ENOMEM); goto out; } INIT_HLIST_NODE(&conn->c_hash_node); conn->c_laddr = *laddr; conn->c_isv6 = !ipv6_addr_v4mapped(laddr); conn->c_faddr = *faddr; conn->c_dev_if = dev_if; conn->c_tos = tos; #if IS_ENABLED(CONFIG_IPV6) /* If the local address is link local, set c_bound_if to be the * index used for this connection. Otherwise, set it to 0 as * the socket is not bound to an interface. c_bound_if is used * to look up a socket when a packet is received */ if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL) conn->c_bound_if = dev_if; else #endif conn->c_bound_if = 0; rds_conn_net_set(conn, net); ret = rds_cong_get_maps(conn); if (ret) { kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(ret); goto out; } /* * This is where a connection becomes loopback. If *any* RDS sockets * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if); if (loop_trans) { rds_trans_put(loop_trans); conn->c_loopback = 1; if (trans->t_prefer_loopback) { if (likely(is_outgoing)) { /* "outgoing" connection to local address. * Protocol says it wants the connection * handled by the loopback transport. * This is what TCP does. */ trans = &rds_loop_transport; } else { /* No transport currently in use * should end up here, but if it * does, reset/destroy the connection. */ kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(-EOPNOTSUPP); goto out; } } } conn->c_trans = trans; init_waitqueue_head(&conn->c_hs_waitq); for (i = 0; i < npaths; i++) { __rds_conn_path_init(conn, &conn->c_path[i], is_outgoing); conn->c_path[i].cp_index = i; } rcu_read_lock(); if (rds_destroy_pending(conn)) ret = -ENETDOWN; else ret = trans->conn_alloc(conn, GFP_ATOMIC); if (ret) { rcu_read_unlock(); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(ret); goto out; } rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n", conn, laddr, faddr, strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : ""); /* * Since we ran without holding the conn lock, someone could * have created the same conn (either normal or passive) in the * interim. We check while holding the lock. If we won, we complete * init and return our conn. If we lost, we rollback and return the * other one. */ spin_lock_irqsave(&rds_conn_lock, flags); if (parent) { /* Creating passive conn */ if (parent->c_passive) { trans->conn_free(conn->c_path[0].cp_transport_data); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = parent->c_passive; } else { parent->c_passive = conn; rds_cong_add_conn(conn); rds_conn_count++; } } else { /* Creating normal conn */ struct rds_connection *found; found = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); if (found) { struct rds_conn_path *cp; int i; for (i = 0; i < npaths; i++) { cp = &conn->c_path[i]; /* The ->conn_alloc invocation may have * allocated resource for all paths, so all * of them may have to be freed here. */ if (cp->cp_transport_data) trans->conn_free(cp->cp_transport_data); } kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = found; } else { conn->c_my_gen_num = rds_gen_num; conn->c_peer_gen_num = 0; hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; } } spin_unlock_irqrestore(&rds_conn_lock, flags); rcu_read_unlock(); out: return conn; } struct rds_connection *rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if) { return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create); struct rds_connection *rds_conn_create_outgoing(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if) { return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); void rds_conn_shutdown(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; /* shut it down unless it's down already */ if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) { /* * Quiesce the connection mgmt handlers before we start tearing * things down. We don't hold the mutex for the entire * duration of the shutdown operation, else we may be * deadlocking with the CM handler. Instead, the CM event * handler is supposed to check for state DISCONNECTING */ mutex_lock(&cp->cp_cm_lock); if (!rds_conn_path_transition(cp, RDS_CONN_UP, RDS_CONN_DISCONNECTING) && !rds_conn_path_transition(cp, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { rds_conn_path_error(cp, "shutdown called in state %d\n", atomic_read(&cp->cp_state)); mutex_unlock(&cp->cp_cm_lock); return; } mutex_unlock(&cp->cp_cm_lock); wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags)); wait_event(cp->cp_waitq, !test_bit(RDS_RECV_REFILL, &cp->cp_flags)); conn->c_trans->conn_path_shutdown(cp); rds_conn_path_reset(cp); if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN) && !rds_conn_path_transition(cp, RDS_CONN_ERROR, RDS_CONN_DOWN)) { /* This can happen - eg when we're in the middle of tearing * down the connection, and someone unloads the rds module. * Quite reproducible with loopback connections. * Mostly harmless. * * Note that this also happens with rds-tcp because * we could have triggered rds_conn_path_drop in irq * mode from rds_tcp_state change on the receipt of * a FIN, thus we need to recheck for RDS_CONN_ERROR * here. */ rds_conn_path_error(cp, "%s: failed to transition " "to state DOWN, current state " "is %d\n", __func__, atomic_read(&cp->cp_state)); return; } } /* Then reconnect if it's still live. * The passive side of an IB loopback connection is never added * to the conn hash, so we never trigger a reconnect on this * conn - the reconnect is always triggered by the active peer. */ cancel_delayed_work_sync(&cp->cp_conn_w); rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); rds_queue_reconnect(cp); } else { rcu_read_unlock(); } } /* destroy a single rds_conn_path. rds_conn_destroy() iterates over * all paths using rds_conn_path_destroy() */ static void rds_conn_path_destroy(struct rds_conn_path *cp) { struct rds_message *rm, *rtmp; if (!cp->cp_transport_data) return; /* make sure lingering queued work won't try to ref the conn */ cancel_delayed_work_sync(&cp->cp_send_w); cancel_delayed_work_sync(&cp->cp_recv_w); rds_conn_path_drop(cp, true); flush_work(&cp->cp_down_w); /* tear down queued messages */ list_for_each_entry_safe(rm, rtmp, &cp->cp_send_queue, m_conn_item) { list_del_init(&rm->m_conn_item); BUG_ON(!list_empty(&rm->m_sock_item)); rds_message_put(rm); } if (cp->cp_xmit_rm) rds_message_put(cp->cp_xmit_rm); WARN_ON(delayed_work_pending(&cp->cp_send_w)); WARN_ON(delayed_work_pending(&cp->cp_recv_w)); WARN_ON(delayed_work_pending(&cp->cp_conn_w)); WARN_ON(work_pending(&cp->cp_down_w)); cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); } /* * Stop and free a connection. * * This can only be used in very limited circumstances. It assumes that once * the conn has been shutdown that no one else is referencing the connection. * We can only ensure this in the rmmod path in the current code. */ void rds_conn_destroy(struct rds_connection *conn) { unsigned long flags; int i; struct rds_conn_path *cp; int npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); rdsdebug("freeing conn %p for %pI4 -> " "%pI4\n", conn, &conn->c_laddr, &conn->c_faddr); /* Ensure conn will not be scheduled for reconnect */ spin_lock_irq(&rds_conn_lock); hlist_del_init_rcu(&conn->c_hash_node); spin_unlock_irq(&rds_conn_lock); synchronize_rcu(); /* shut the connection down */ for (i = 0; i < npaths; i++) { cp = &conn->c_path[i]; rds_conn_path_destroy(cp); BUG_ON(!list_empty(&cp->cp_retrans)); } /* * The congestion maps aren't freed up here. They're * freed by rds_cong_exit() after all the connections * have been freed. */ rds_cong_remove_conn(conn); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); spin_lock_irqsave(&rds_conn_lock, flags); rds_conn_count--; spin_unlock_irqrestore(&rds_conn_lock, flags); } EXPORT_SYMBOL_GPL(rds_conn_destroy); static void __rds_inc_msg_cp(struct rds_incoming *inc, struct rds_info_iterator *iter, void *saddr, void *daddr, int flip, bool isv6) { #if IS_ENABLED(CONFIG_IPV6) if (isv6) rds6_inc_info_copy(inc, iter, saddr, daddr, flip); else #endif rds_inc_info_copy(inc, iter, *(__be32 *)saddr, *(__be32 *)daddr, flip); } static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send, bool isv6) { struct hlist_head *head; struct list_head *list; struct rds_connection *conn; struct rds_message *rm; unsigned int total = 0; unsigned long flags; size_t i; int j; if (isv6) len /= sizeof(struct rds6_info_message); else len /= sizeof(struct rds_info_message); rcu_read_lock(); for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { struct rds_conn_path *cp; int npaths; if (!isv6 && conn->c_isv6) continue; npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); for (j = 0; j < npaths; j++) { cp = &conn->c_path[j]; if (want_send) list = &cp->cp_send_queue; else list = &cp->cp_retrans; spin_lock_irqsave(&cp->cp_lock, flags); /* XXX too lazy to maintain counts.. */ list_for_each_entry(rm, list, m_conn_item) { total++; if (total <= len) __rds_inc_msg_cp(&rm->m_inc, iter, &conn->c_laddr, &conn->c_faddr, 0, isv6); } spin_unlock_irqrestore(&cp->cp_lock, flags); } } } rcu_read_unlock(); lens->nr = total; if (isv6) lens->each = sizeof(struct rds6_info_message); else lens->each = sizeof(struct rds_info_message); } static void rds_conn_message_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send) { rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send) { rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true); } #endif static void rds_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds_conn_message_info(sock, len, iter, lens, 1); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds6_conn_message_info(sock, len, iter, lens, 1); } #endif static void rds_conn_message_info_retrans(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds_conn_message_info(sock, len, iter, lens, 0); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info_retrans(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds6_conn_message_info(sock, len, iter, lens, 0); } #endif void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), u64 *buffer, size_t item_len) { struct hlist_head *head; struct rds_connection *conn; size_t i; rcu_read_lock(); lens->nr = 0; lens->each = item_len; for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { /* XXX no c_lock usage.. */ if (!visitor(conn, buffer)) continue; /* We copy as much as we can fit in the buffer, * but we count all items so that the caller * can resize the buffer. */ if (len >= item_len) { rds_info_copy(iter, buffer, item_len); len -= item_len; } lens->nr++; } } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_for_each_conn_info); static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_conn_path *, void *), u64 *buffer, size_t item_len) { struct hlist_head *head; struct rds_connection *conn; size_t i; rcu_read_lock(); lens->nr = 0; lens->each = item_len; for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { struct rds_conn_path *cp; /* XXX We only copy the information from the first * path for now. The problem is that if there are * more than one underlying paths, we cannot report * information of all of them using the existing * API. For example, there is only one next_tx_seq, * which path's next_tx_seq should we report? It is * a bug in the design of MPRDS. */ cp = conn->c_path; /* XXX no cp_lock usage.. */ if (!visitor(cp, buffer)) continue; /* We copy as much as we can fit in the buffer, * but we count all items so that the caller * can resize the buffer. */ if (len >= item_len) { rds_info_copy(iter, buffer, item_len); len -= item_len; } lens->nr++; } } rcu_read_unlock(); } static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds_info_connection *cinfo = buffer; struct rds_connection *conn = cp->cp_conn; if (conn->c_isv6) return 0; cinfo->next_tx_seq = cp->cp_next_tx_seq; cinfo->next_rx_seq = cp->cp_next_rx_seq; cinfo->laddr = conn->c_laddr.s6_addr32[3]; cinfo->faddr = conn->c_faddr.s6_addr32[3]; cinfo->tos = conn->c_tos; strscpy_pad(cinfo->transport, conn->c_trans->t_name); cinfo->flags = 0; rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo->flags, atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, CONNECTING); rds_conn_info_set(cinfo->flags, atomic_read(&cp->cp_state) == RDS_CONN_UP, CONNECTED); return 1; } #if IS_ENABLED(CONFIG_IPV6) static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds6_info_connection *cinfo6 = buffer; struct rds_connection *conn = cp->cp_conn; cinfo6->next_tx_seq = cp->cp_next_tx_seq; cinfo6->next_rx_seq = cp->cp_next_rx_seq; cinfo6->laddr = conn->c_laddr; cinfo6->faddr = conn->c_faddr; strscpy_pad(cinfo6->transport, conn->c_trans->t_name); cinfo6->flags = 0; rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo6->flags, atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, CONNECTING); rds_conn_info_set(cinfo6->flags, atomic_read(&cp->cp_state) == RDS_CONN_UP, CONNECTED); /* Just return 1 as there is no error case. This is a helper function * for rds_walk_conn_path_info() and it wants a return value. */ return 1; } #endif static void rds_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { u64 buffer[(sizeof(struct rds_info_connection) + 7) / 8]; rds_walk_conn_path_info(sock, len, iter, lens, rds_conn_info_visitor, buffer, sizeof(struct rds_info_connection)); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8]; rds_walk_conn_path_info(sock, len, iter, lens, rds6_conn_info_visitor, buffer, sizeof(struct rds6_info_connection)); } #endif int rds_conn_init(void) { int ret; ret = rds_loop_net_init(); /* register pernet callback */ if (ret) return ret; rds_conn_slab = KMEM_CACHE(rds_connection, 0); if (!rds_conn_slab) { rds_loop_net_exit(); return -ENOMEM; } rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); rds_info_register_func(RDS_INFO_SEND_MESSAGES, rds_conn_message_info_send); rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); #if IS_ENABLED(CONFIG_IPV6) rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); rds_info_register_func(RDS6_INFO_SEND_MESSAGES, rds6_conn_message_info_send); rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES, rds6_conn_message_info_retrans); #endif return 0; } void rds_conn_exit(void) { rds_loop_net_exit(); /* unregister pernet callback */ rds_loop_exit(); WARN_ON(!hlist_empty(rds_conn_hash)); kmem_cache_destroy(rds_conn_slab); rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info); rds_info_deregister_func(RDS_INFO_SEND_MESSAGES, rds_conn_message_info_send); rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); #if IS_ENABLED(CONFIG_IPV6) rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES, rds6_conn_message_info_send); rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES, rds6_conn_message_info_retrans); #endif } /* * Force a disconnect */ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) { atomic_set(&cp->cp_state, RDS_CONN_ERROR); rcu_read_lock(); if (!destroy && rds_destroy_pending(cp->cp_conn)) { rcu_read_unlock(); return; } queue_work(rds_wq, &cp->cp_down_w); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_drop); void rds_conn_drop(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); rds_conn_path_drop(&conn->c_path[0], false); } EXPORT_SYMBOL_GPL(rds_conn_drop); /* * If the connection is down, trigger a connect. We may have scheduled a * delayed reconnect however - in this case we should not interfere. */ void rds_conn_path_connect_if_down(struct rds_conn_path *cp) { rcu_read_lock(); if (rds_destroy_pending(cp->cp_conn)) { rcu_read_unlock(); return; } if (rds_conn_path_state(cp) == RDS_CONN_DOWN && !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); /* Check connectivity of all paths */ void rds_check_all_paths(struct rds_connection *conn) { int i = 0; do { rds_conn_path_connect_if_down(&conn->c_path[i]); } while (++i < conn->c_npaths); } void rds_conn_connect_if_down(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); rds_conn_path_connect_if_down(&conn->c_path[0]); } EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); void __rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vprintk(fmt, ap); va_end(ap); rds_conn_path_drop(cp, false); }
46 10 4 47 6 3 3 3 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 /* SPDX-License-Identifier: GPL-2.0-only */ #undef TRACE_SYSTEM #define TRACE_SYSTEM l2tp #if !defined(_TRACE_L2TP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_L2TP_H #include <linux/tracepoint.h> #include <linux/l2tp.h> #include "l2tp_core.h" #define encap_type_name(e) { L2TP_ENCAPTYPE_##e, #e } #define show_encap_type_name(val) \ __print_symbolic(val, \ encap_type_name(UDP), \ encap_type_name(IP)) #define pw_type_name(p) { L2TP_PWTYPE_##p, #p } #define show_pw_type_name(val) \ __print_symbolic(val, \ pw_type_name(ETH_VLAN), \ pw_type_name(ETH), \ pw_type_name(PPP), \ pw_type_name(PPP_AC), \ pw_type_name(IP)) DECLARE_EVENT_CLASS(tunnel_only_evt, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel), TP_STRUCT__entry( __array(char, name, L2TP_TUNNEL_NAME_MAX) ), TP_fast_assign( memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX); ), TP_printk("%s", __entry->name) ); DECLARE_EVENT_CLASS(session_only_evt, TP_PROTO(struct l2tp_session *session), TP_ARGS(session), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); ), TP_printk("%s", __entry->name) ); TRACE_EVENT(register_tunnel, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel), TP_STRUCT__entry( __array(char, name, L2TP_TUNNEL_NAME_MAX) __field(int, fd) __field(u32, tid) __field(u32, ptid) __field(int, version) __field(enum l2tp_encap_type, encap) ), TP_fast_assign( memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX); __entry->fd = tunnel->fd; __entry->tid = tunnel->tunnel_id; __entry->ptid = tunnel->peer_tunnel_id; __entry->version = tunnel->version; __entry->encap = tunnel->encap; ), TP_printk("%s: type=%s encap=%s version=L2TPv%d tid=%u ptid=%u fd=%d", __entry->name, __entry->fd > 0 ? "managed" : "unmanaged", show_encap_type_name(__entry->encap), __entry->version, __entry->tid, __entry->ptid, __entry->fd) ); DEFINE_EVENT(tunnel_only_evt, delete_tunnel, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel) ); DEFINE_EVENT(tunnel_only_evt, free_tunnel, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel) ); TRACE_EVENT(register_session, TP_PROTO(struct l2tp_session *session), TP_ARGS(session), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) __field(u32, tid) __field(u32, ptid) __field(u32, sid) __field(u32, psid) __field(enum l2tp_pwtype, pwtype) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); __entry->tid = session->tunnel ? session->tunnel->tunnel_id : 0; __entry->ptid = session->tunnel ? session->tunnel->peer_tunnel_id : 0; __entry->sid = session->session_id; __entry->psid = session->peer_session_id; __entry->pwtype = session->pwtype; ), TP_printk("%s: pseudowire=%s sid=%u psid=%u tid=%u ptid=%u", __entry->name, show_pw_type_name(__entry->pwtype), __entry->sid, __entry->psid, __entry->sid, __entry->psid) ); DEFINE_EVENT(session_only_evt, delete_session, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_only_evt, free_session, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_only_evt, session_seqnum_lns_enable, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_only_evt, session_seqnum_lns_disable, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DECLARE_EVENT_CLASS(session_seqnum_evt, TP_PROTO(struct l2tp_session *session), TP_ARGS(session), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) __field(u32, ns) __field(u32, nr) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); __entry->ns = session->ns; __entry->nr = session->nr; ), TP_printk("%s: ns=%u nr=%u", __entry->name, __entry->ns, __entry->nr) ); DEFINE_EVENT(session_seqnum_evt, session_seqnum_update, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_seqnum_evt, session_seqnum_reset, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DECLARE_EVENT_CLASS(session_pkt_discard_evt, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) __field(u32, pkt_ns) __field(u32, my_nr) __field(u32, reorder_q_len) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); __entry->pkt_ns = pkt_ns, __entry->my_nr = session->nr; __entry->reorder_q_len = skb_queue_len(&session->reorder_q); ), TP_printk("%s: pkt_ns=%u my_nr=%u reorder_q_len=%u", __entry->name, __entry->pkt_ns, __entry->my_nr, __entry->reorder_q_len) ); DEFINE_EVENT(session_pkt_discard_evt, session_pkt_expired, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns) ); DEFINE_EVENT(session_pkt_discard_evt, session_pkt_outside_rx_window, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns) ); DEFINE_EVENT(session_pkt_discard_evt, session_pkt_oos, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns) ); #endif /* _TRACE_L2TP_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
454 74 444 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_TIMEOUT_H #define _NF_CONNTRACK_TIMEOUT_H #include <net/net_namespace.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <linux/refcount.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #define CTNL_TIMEOUT_NAME_MAX 32 struct nf_ct_timeout { __u16 l3num; const struct nf_conntrack_l4proto *l4proto; char data[]; }; struct nf_conn_timeout { struct nf_ct_timeout __rcu *timeout; }; static inline unsigned int * nf_ct_timeout_data(const struct nf_conn_timeout *t) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_ct_timeout *timeout; timeout = rcu_dereference(t->timeout); if (timeout == NULL) return NULL; return (unsigned int *)timeout->data; #else return NULL; #endif } static inline struct nf_conn_timeout *nf_ct_timeout_find(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT return nf_ct_ext_find(ct, NF_CT_EXT_TIMEOUT); #else return NULL; #endif } static inline struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct, struct nf_ct_timeout *timeout, gfp_t gfp) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; timeout_ext = nf_ct_ext_add(ct, NF_CT_EXT_TIMEOUT, gfp); if (timeout_ext == NULL) return NULL; rcu_assign_pointer(timeout_ext->timeout, timeout); return timeout_ext; #else return NULL; #endif }; static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) { unsigned int *timeouts = NULL; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; timeout_ext = nf_ct_timeout_find(ct); if (timeout_ext) timeouts = nf_ct_timeout_data(timeout_ext); #endif return timeouts; } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout); int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name); void nf_ct_destroy_timeout(struct nf_conn *ct); #else static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name) { return -EOPNOTSUPP; } static inline void nf_ct_destroy_timeout(struct nf_conn *ct) { return; } #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_ct_timeout_hooks { struct nf_ct_timeout *(*timeout_find_get)(struct net *net, const char *name); void (*timeout_put)(struct nf_ct_timeout *timeout); }; extern const struct nf_ct_timeout_hooks __rcu *nf_ct_timeout_hook; #endif #endif /* _NF_CONNTRACK_TIMEOUT_H */
2209 1 1 693 692 694 694 6480 6480 6497 4016 3011 689 694 694 608 596 5026 3011 693 694 48 48 48 6479 6491 11 6479 368 368 598 596 596 601 600 600 598 596 596 596 33 33 33 693 694 694 694 694 3 94 568 568 568 567 201 200 282 282 280 217 216 186 31 217 1066 799 282 60 61 184 33 2 2 2 2 2 2 686 687 687 7 2 5 211 20 192 9 95 11 12 119 9 117 6 21 21 362 2 681 316 364 683 1 345 339 684 31 332 322 681 315 352 14 70 70 683 315 366 582 580 2 562 15 5 683 101 69 1 32 32 5 1 96 581 582 714 1 1 712 378 334 1 318 377 7 694 377 317 405 285 662 31 693 692 694 692 10 685 3 3 684 682 681 3 1 2 7 663 65 6 633 27 576 1 277 316 593 315 277 15 266 315 315 24 242 582 230 40 266 267 315 582 568 12 15 56 89 11 97 3 33 37 37 36 38 55 303 380 373 7 55 28 300 48 267 229 37 244 24 267 267 244 21 21 295 108 1 106 1 1 1 81 20 102 33 89 2 2 4 2 98 97 110 11 101 4 4 3 2 4 2 1 147 1 147 147 51 12 47 104 149 149 147 147 8 141 7 41 92 80 61 10 130 58 83 42 98 138 145 46 102 143 4 147 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/fork.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * 'fork.c' contains the help-routines for the 'fork' system call * (see also entry.S and others). * Fork is rather simple, once you get the hang of it, but the memory * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' */ #include <linux/anon_inodes.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/user.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/stat.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/sched/ext.h> #include <linux/seq_file.h> #include <linux/rtmutex.h> #include <linux/init.h> #include <linux/unistd.h> #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/completion.h> #include <linux/personality.h> #include <linux/mempolicy.h> #include <linux/sem.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/kmsan.h> #include <linux/binfmts.h> #include <linux/mman.h> #include <linux/mmu_notifier.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/memblock.h> #include <linux/nsproxy.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/cgroup.h> #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/seccomp.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/syscall_user_dispatch.h> #include <linux/jiffies.h> #include <linux/futex.h> #include <linux/compat.h> #include <linux/kthread.h> #include <linux/task_io_accounting_ops.h> #include <linux/rcupdate.h> #include <linux/ptrace.h> #include <linux/mount.h> #include <linux/audit.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> #include <linux/proc_fs.h> #include <linux/profile.h> #include <linux/rmap.h> #include <linux/ksm.h> #include <linux/acct.h> #include <linux/userfaultfd_k.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/freezer.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> #include <linux/tty.h> #include <linux/fs_struct.h> #include <linux/magic.h> #include <linux/perf_event.h> #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> #include <linux/oom.h> #include <linux/khugepaged.h> #include <linux/signalfd.h> #include <linux/uprobes.h> #include <linux/aio.h> #include <linux/compiler.h> #include <linux/sysctl.h> #include <linux/kcov.h> #include <linux/livepatch.h> #include <linux/thread_info.h> #include <linux/kstack_erase.h> #include <linux/kasan.h> #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/bpf.h> #include <linux/stackprotector.h> #include <linux/user_events.h> #include <linux/iommu.h> #include <linux/rseq.h> #include <uapi/linux/pidfd.h> #include <linux/pidfs.h> #include <linux/tick.h> #include <linux/unwind_deferred.h> #include <linux/pgalloc.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> /* For dup_mmap(). */ #include "../mm/internal.h" #include <trace/events/sched.h> #define CREATE_TRACE_POINTS #include <trace/events/task.h> #include <kunit/visibility.h> /* * Minimum number of threads to boot the kernel */ #define MIN_THREADS 20 /* * Maximum number of threads */ #define MAX_THREADS FUTEX_TID_MASK /* * Protected counters by write_lock_irq(&tasklist_lock) */ unsigned long total_forks; /* Handle normal Linux uptimes. */ int nr_threads; /* The idle threads do not count.. */ static int max_threads; /* tunable limit on nr_threads */ #define NAMED_ARRAY_INDEX(x) [x] = __stringify(x) static const char * const resident_page_types[] = { NAMED_ARRAY_INDEX(MM_FILEPAGES), NAMED_ARRAY_INDEX(MM_ANONPAGES), NAMED_ARRAY_INDEX(MM_SWAPENTS), NAMED_ARRAY_INDEX(MM_SHMEMPAGES), }; DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ #ifdef CONFIG_PROVE_RCU int lockdep_tasklist_lock_is_held(void) { return lockdep_is_held(&tasklist_lock); } EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held); #endif /* #ifdef CONFIG_PROVE_RCU */ int nr_processes(void) { int cpu; int total = 0; for_each_possible_cpu(cpu) total += per_cpu(process_counts, cpu); return total; } void __weak arch_release_task_struct(struct task_struct *tsk) { } static struct kmem_cache *task_struct_cachep; static inline struct task_struct *alloc_task_struct_node(int node) { return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); } static inline void free_task_struct(struct task_struct *tsk) { kmem_cache_free(task_struct_cachep, tsk); } #ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); /* * Allocated stacks are cached and later reused by new threads, so memcg * accounting is performed by the code assigning/releasing stacks to tasks. * We need a zeroed memory without __GFP_ACCOUNT. */ #define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO) struct vm_stack { struct rcu_head rcu; struct vm_struct *stack_vm_area; }; static struct vm_struct *alloc_thread_stack_node_from_cache(struct task_struct *tsk, int node) { struct vm_struct *vm_area; unsigned int i; /* * If the node has memory, we are guaranteed the stacks are backed by local pages. * Otherwise the pages are arbitrary. * * Note that depending on cpuset it is possible we will get migrated to a different * node immediately after allocating here, so this does *not* guarantee locality for * arbitrary callers. */ scoped_guard(preempt) { if (node != NUMA_NO_NODE && numa_node_id() != node) return NULL; for (i = 0; i < NR_CACHED_STACKS; i++) { vm_area = this_cpu_xchg(cached_stacks[i], NULL); if (vm_area) return vm_area; } } return NULL; } static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) { unsigned int i; int nid; /* * Don't cache stacks if any of the pages don't match the local domain, unless * there is no local memory to begin with. * * Note that lack of local memory does not automatically mean it makes no difference * performance-wise which other domain backs the stack. In this case we are merely * trying to avoid constantly going to vmalloc. */ scoped_guard(preempt) { nid = numa_node_id(); if (node_state(nid, N_MEMORY)) { for (i = 0; i < vm_area->nr_pages; i++) { struct page *page = vm_area->pages[i]; if (page_to_nid(page) != nid) return false; } } for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *tmp = NULL; if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) return true; } } return false; } static void thread_stack_free_rcu(struct rcu_head *rh) { struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); struct vm_struct *vm_area = vm_stack->stack_vm_area; if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) return; vfree(vm_area->addr); } static void thread_stack_delayed_free(struct task_struct *tsk) { struct vm_stack *vm_stack = tsk->stack; vm_stack->stack_vm_area = tsk->stack_vm_area; call_rcu(&vm_stack->rcu, thread_stack_free_rcu); } static int free_vm_stack_cache(unsigned int cpu) { struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu); int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *vm_area = cached_vm_stack_areas[i]; if (!vm_area) continue; vfree(vm_area->addr); cached_vm_stack_areas[i] = NULL; } return 0; } static int memcg_charge_kernel_stack(struct vm_struct *vm_area) { int i; int ret; int nr_charged = 0; BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0); if (ret) goto err; nr_charged++; } return 0; err: for (i = 0; i < nr_charged; i++) memcg_kmem_uncharge_page(vm_area->pages[i], 0); return ret; } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { struct vm_struct *vm_area; void *stack; vm_area = alloc_thread_stack_node_from_cache(tsk, node); if (vm_area) { if (memcg_charge_kernel_stack(vm_area)) { vfree(vm_area->addr); return -ENOMEM; } /* Reset stack metadata. */ kasan_unpoison_range(vm_area->addr, THREAD_SIZE); stack = kasan_reset_tag(vm_area->addr); /* Clear stale pointers from reused stack. */ memset(stack, 0, THREAD_SIZE); tsk->stack_vm_area = vm_area; tsk->stack = stack; return 0; } stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, GFP_VMAP_STACK, node, __builtin_return_address(0)); if (!stack) return -ENOMEM; vm_area = find_vm_area(stack); if (memcg_charge_kernel_stack(vm_area)) { vfree(stack); return -ENOMEM; } /* * We can't call find_vm_area() in interrupt context, and * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ tsk->stack_vm_area = vm_area; stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; } static void free_thread_stack(struct task_struct *tsk) { if (!try_release_thread_stack_to_cache(tsk->stack_vm_area)) thread_stack_delayed_free(tsk); tsk->stack = NULL; tsk->stack_vm_area = NULL; } #else /* !CONFIG_VMAP_STACK */ /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ #if THREAD_SIZE >= PAGE_SIZE static void thread_stack_free_rcu(struct rcu_head *rh) { __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER); } static void thread_stack_delayed_free(struct task_struct *tsk) { struct rcu_head *rh = tsk->stack; call_rcu(rh, thread_stack_free_rcu); } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); if (likely(page)) { tsk->stack = kasan_reset_tag(page_address(page)); return 0; } return -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) { thread_stack_delayed_free(tsk); tsk->stack = NULL; } #else /* !(THREAD_SIZE >= PAGE_SIZE) */ static struct kmem_cache *thread_stack_cache; static void thread_stack_free_rcu(struct rcu_head *rh) { kmem_cache_free(thread_stack_cache, rh); } static void thread_stack_delayed_free(struct task_struct *tsk) { struct rcu_head *rh = tsk->stack; call_rcu(rh, thread_stack_free_rcu); } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { unsigned long *stack; stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); stack = kasan_reset_tag(stack); tsk->stack = stack; return stack ? 0 : -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) { thread_stack_delayed_free(tsk); tsk->stack = NULL; } void thread_stack_cache_init(void) { thread_stack_cache = kmem_cache_create_usercopy("thread_stack", THREAD_SIZE, THREAD_SIZE, 0, 0, THREAD_SIZE, NULL); BUG_ON(thread_stack_cache == NULL); } #endif /* THREAD_SIZE >= PAGE_SIZE */ #endif /* CONFIG_VMAP_STACK */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; /* SLAB cache for sighand_struct structures (tsk->sighand) */ struct kmem_cache *sighand_cachep; /* SLAB cache for files_struct structures (tsk->files) */ struct kmem_cache *files_cachep; /* SLAB cache for fs_struct structures (tsk->fs) */ struct kmem_cache *fs_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { struct vm_struct *vm_area = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { void *stack = task_stack_page(tsk); /* All stack pages are in the same node. */ mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); } } void exit_task_stack_account(struct task_struct *tsk) { account_kernel_stack(tsk, -1); if (IS_ENABLED(CONFIG_VMAP_STACK)) { struct vm_struct *vm_area; int i; vm_area = task_stack_vm_area(tsk); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) memcg_kmem_uncharge_page(vm_area->pages[i], 0); } } static void release_task_stack(struct task_struct *tsk) { if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD)) return; /* Better to leak the stack than to free prematurely */ free_thread_stack(tsk); } #ifdef CONFIG_THREAD_INFO_IN_TASK void put_task_stack(struct task_struct *tsk) { if (refcount_dec_and_test(&tsk->stack_refcount)) release_task_stack(tsk); } #endif void free_task(struct task_struct *tsk) { #ifdef CONFIG_SECCOMP WARN_ON_ONCE(tsk->seccomp.filter); #endif release_user_cpus_ptr(tsk); scs_release(tsk); #ifndef CONFIG_THREAD_INFO_IN_TASK /* * The task is finally done with both the stack and thread_info, * so free both. */ release_task_stack(tsk); #else /* * If the task had a separate stack allocation, it should be gone * by now. */ WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); bpf_task_storage_free(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) { struct file *exe_file; exe_file = get_mm_exe_file(oldmm); RCU_INIT_POINTER(mm->exe_file, exe_file); /* * We depend on the oldmm having properly denied write access to the * exe_file already. */ if (exe_file && exe_file_deny_write_access(exe_file)) pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__); } #ifdef CONFIG_MMU static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); if (unlikely(!mm->pgd)) return -ENOMEM; return 0; } static inline void mm_free_pgd(struct mm_struct *mm) { pgd_free(mm, mm->pgd); } #else #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ #ifdef CONFIG_MM_ID static DEFINE_IDA(mm_ida); static inline int mm_alloc_id(struct mm_struct *mm) { int ret; ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL); if (ret < 0) return ret; mm->mm_id = ret; return 0; } static inline void mm_free_id(struct mm_struct *mm) { const mm_id_t id = mm->mm_id; mm->mm_id = MM_ID_DUMMY; if (id == MM_ID_DUMMY) return; if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX)) return; ida_free(&mm_ida, id); } #else /* !CONFIG_MM_ID */ static inline int mm_alloc_id(struct mm_struct *mm) { return 0; } static inline void mm_free_id(struct mm_struct *mm) {} #endif /* CONFIG_MM_ID */ static void check_mm(struct mm_struct *mm) { int i; BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, "Please make sure 'struct resident_page_types[]' is updated as well"); for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); if (unlikely(x)) { pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n", mm, resident_page_types[i], x, current->comm, task_pid_nr(current)); } } if (mm_pgtables_bytes(mm)) pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", mm_pgtables_bytes(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif } #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) static void do_check_lazy_tlb(void *arg) { struct mm_struct *mm = arg; WARN_ON_ONCE(current->active_mm == mm); } static void do_shoot_lazy_tlb(void *arg) { struct mm_struct *mm = arg; if (current->active_mm == mm) { WARN_ON_ONCE(current->mm); current->active_mm = &init_mm; switch_mm(mm, &init_mm, current); } } static void cleanup_lazy_tlbs(struct mm_struct *mm) { if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) { /* * In this case, lazy tlb mms are refounted and would not reach * __mmdrop until all CPUs have switched away and mmdrop()ed. */ return; } /* * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it * requires lazy mm users to switch to another mm when the refcount * drops to zero, before the mm is freed. This requires IPIs here to * switch kernel threads to init_mm. * * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm * switch with the final userspace teardown TLB flush which leaves the * mm lazy on this CPU but no others, reducing the need for additional * IPIs here. There are cases where a final IPI is still required here, * such as the final mmdrop being performed on a different CPU than the * one exiting, or kernel threads using the mm when userspace exits. * * IPI overheads have not found to be expensive, but they could be * reduced in a number of possible ways, for example (roughly * increasing order of complexity): * - The last lazy reference created by exit_mm() could instead switch * to init_mm, however it's probable this will run on the same CPU * immediately afterwards, so this may not reduce IPIs much. * - A batch of mms requiring IPIs could be gathered and freed at once. * - CPUs store active_mm where it can be remotely checked without a * lock, to filter out false-positives in the cpumask. * - After mm_users or mm_count reaches zero, switching away from the * mm could clear mm_cpumask to reduce some IPIs, perhaps together * with some batching or delaying of the final IPIs. * - A delayed freeing and RCU-like quiescing sequence based on mm * switching to avoid IPIs completely. */ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1); if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES)) on_each_cpu(do_check_lazy_tlb, (void *)mm, 1); } /* * Called when the last reference to the mm * is dropped: either by a lazy thread or by * mmput. Free the page directory and the mm. */ void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); /* Ensure no CPUs are using this as their lazy tlb mm */ cleanup_lazy_tlbs(mm); WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); mm_free_id(mm); destroy_context(mm); mmu_notifier_subscriptions_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); mm_pasid_drop(mm); mm_destroy_cid(mm); percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); static void mmdrop_async_fn(struct work_struct *work) { struct mm_struct *mm; mm = container_of(work, struct mm_struct, async_put_work); __mmdrop(mm); } static void mmdrop_async(struct mm_struct *mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) { INIT_WORK(&mm->async_put_work, mmdrop_async_fn); schedule_work(&mm->async_put_work); } } static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); sched_autogroup_exit(sig); /* * __mmdrop is not safe to call from softirq context on x86 due to * pgd_dtor so postpone it to the async context */ if (sig->oom_mm) mmdrop_async(sig->oom_mm); kmem_cache_free(signal_cachep, sig); } static inline void put_signal_struct(struct signal_struct *sig) { if (refcount_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); } void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); unwind_task_free(tsk); io_uring_free(tsk); cgroup_task_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); sched_core_free(tsk); free_task(tsk); } EXPORT_SYMBOL_GPL(__put_task_struct); void __put_task_struct_rcu_cb(struct rcu_head *rhp) { struct task_struct *task = container_of(rhp, struct task_struct, rcu); __put_task_struct(task); } EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb); void __init __weak arch_task_cache_init(void) { } /* * set_max_threads */ static void __init set_max_threads(unsigned int max_threads_suggested) { u64 threads; unsigned long nr_pages = memblock_estimated_nr_free_pages(); /* * The number of threads shall be limited such that the thread * structures may only consume a small part of the available memory. */ if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64) threads = MAX_THREADS; else threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE, (u64) THREAD_SIZE * 8UL); if (threads > max_threads_suggested) threads = max_threads_suggested; max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); } #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT /* Initialized by the architecture: */ int arch_task_struct_size __read_mostly; #endif static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size) { /* Fetch thread_struct whitelist for the architecture. */ arch_thread_struct_whitelist(offset, size); /* * Handle zero-sized whitelist or empty thread_struct, otherwise * adjust offset to position of thread_struct in task_struct. */ if (unlikely(*size == 0)) *offset = 0; else *offset += offsetof(struct task_struct, thread); } void __init fork_init(void) { int i; #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN 0 #endif int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); unsigned long useroffset, usersize; /* create a slab on which task_structs can be allocated */ task_struct_whitelist(&useroffset, &usersize); task_struct_cachep = kmem_cache_create_usercopy("task_struct", arch_task_struct_size, align, SLAB_PANIC|SLAB_ACCOUNT, useroffset, usersize, NULL); /* do the arch specific task caches init */ arch_task_cache_init(); set_max_threads(MAX_THREADS); init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; for (i = 0; i < UCOUNT_COUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY); set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY); set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY); set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL, free_vm_stack_cache); #endif scs_init(); lockdep_init_task(&init_task); uprobes_init(); } int __weak arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; return 0; } void set_task_stack_end_magic(struct task_struct *tsk) { unsigned long *stackend; stackend = end_of_stack(tsk); *stackend = STACK_END_MAGIC; /* for overflow detection */ } static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; int err; if (node == NUMA_NO_NODE) node = tsk_fork_get_node(orig); tsk = alloc_task_struct_node(node); if (!tsk) return NULL; err = arch_dup_task_struct(tsk, orig); if (err) goto free_tsk; err = alloc_thread_stack_node(tsk, node); if (err) goto free_tsk; #ifdef CONFIG_THREAD_INFO_IN_TASK refcount_set(&tsk->stack_refcount, 1); #endif account_kernel_stack(tsk, 1); err = scs_prepare(tsk, node); if (err) goto free_stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under * the sighand lock in case orig has changed between now and * then. Until then, filter must be NULL to avoid messing up * the usage counts on the error path calling free_task. */ tsk->seccomp.filter = NULL; #endif setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); clear_syscall_work_syscall_user_dispatch(tsk); #ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); #endif if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = &tsk->cpus_mask; dup_user_cpus_ptr(tsk, orig, node); /* * One for the user space visible state that goes away when reaped. * One for the scheduler. */ refcount_set(&tsk->rcu_users, 2); /* One for the rcu users */ refcount_set(&tsk->usage, 1); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; tsk->worker_private = NULL; kcov_task_init(tsk); kmsan_task_create(tsk); kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION tsk->fail_nth = 0; #endif #ifdef CONFIG_BLK_CGROUP tsk->throttle_disk = NULL; tsk->use_memdelay = 0; #endif #ifdef CONFIG_ARCH_HAS_CPU_PASID tsk->pasid_activated = 0; #endif #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif #ifdef CONFIG_X86_BUS_LOCK_DETECT tsk->reported_split_lock = 0; #endif #ifdef CONFIG_SCHED_MM_CID tsk->mm_cid.cid = MM_CID_UNSET; tsk->mm_cid.active = 0; #endif return tsk; free_stack: exit_task_stack_account(tsk); free_thread_stack(tsk); free_tsk: free_task_struct(tsk); return NULL; } __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; static int __init coredump_filter_setup(char *s) { default_dump_filter = (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & MMF_DUMP_FILTER_MASK; return 1; } __setup("coredump_filter=", coredump_filter_setup); #include <linux/init_task.h> static void mm_init_aio(struct mm_struct *mm) { #ifdef CONFIG_AIO spin_lock_init(&mm->ioctx_lock); mm->ioctx_table = NULL; #endif } static __always_inline void mm_clear_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG if (mm->owner == p) WRITE_ONCE(mm->owner, NULL); #endif } static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG mm->owner = p; #endif } static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; arch_uprobe_init_state(mm); #endif } static void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); mm_lock_seqcount_init(mm); #ifdef CONFIG_PER_VMA_LOCK rcuwait_init(&mm->vma_writer_wait); #endif } static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); spin_lock_init(&mm->arg_lock); mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); mm_pasid_init(mm); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); hugetlb_count_init(mm); mm_flags_clear_all(mm); if (current->mm) { unsigned long flags = __mm_flags_get_word(current->mm); __mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags)); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { __mm_flags_overwrite_word(mm, default_dump_filter); mm->def_flags = 0; } if (futex_mm_init(mm)) goto fail_mm_init; if (mm_alloc_pgd(mm)) goto fail_nopgd; if (mm_alloc_id(mm)) goto fail_noid; if (init_new_context(p, mm)) goto fail_nocontext; if (mm_alloc_cid(mm, p)) goto fail_cid; if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, NR_MM_COUNTERS)) goto fail_pcpu; mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); return mm; fail_pcpu: mm_destroy_cid(mm); fail_cid: destroy_context(mm); fail_nocontext: mm_free_id(mm); fail_noid: mm_free_pgd(mm); fail_nopgd: futex_hash_free(mm); fail_mm_init: free_mm(mm); return NULL; } /* * Allocate and initialize an mm_struct. */ struct mm_struct *mm_alloc(void) { struct mm_struct *mm; mm = allocate_mm(); if (!mm) return NULL; memset(mm, 0, sizeof(*mm)); return mm_init(mm, current, current_user_ns()); } EXPORT_SYMBOL_IF_KUNIT(mm_alloc); static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users)); uprobe_clear_state(mm); exit_aio(mm); ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); mm_put_huge_zero_folio(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); list_del(&mm->mmlist); spin_unlock(&mmlist_lock); } if (mm->binfmt) module_put(mm->binfmt->module); lru_gen_del_mm(mm); futex_hash_free(mm); mmdrop(mm); } /* * Decrement the use count and release all resources for an mm. */ void mmput(struct mm_struct *mm) { might_sleep(); if (atomic_dec_and_test(&mm->mm_users)) __mmput(mm); } EXPORT_SYMBOL_GPL(mmput); #if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) static void mmput_async_fn(struct work_struct *work) { struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); __mmput(mm); } void mmput_async(struct mm_struct *mm) { if (atomic_dec_and_test(&mm->mm_users)) { INIT_WORK(&mm->async_put_work, mmput_async_fn); schedule_work(&mm->async_put_work); } } EXPORT_SYMBOL_GPL(mmput_async); #endif /** * set_mm_exe_file - change a reference to the mm's executable file * @mm: The mm to change. * @new_exe_file: The new file to use. * * This changes mm's executable file (shown as symlink /proc/[pid]/exe). * * Main users are mmput() and sys_execve(). Callers prevent concurrent * invocations: in mmput() nobody alive left, in execve it happens before * the new mm is made visible to anyone. * * Can only fail if new_exe_file != NULL. */ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { struct file *old_exe_file; /* * It is safe to dereference the exe_file without RCU as * this function is only called if nobody else can access * this mm -- see comment above for justification. */ old_exe_file = rcu_dereference_raw(mm->exe_file); if (new_exe_file) { /* * We expect the caller (i.e., sys_execve) to already denied * write access, so this is unlikely to fail. */ if (unlikely(exe_file_deny_write_access(new_exe_file))) return -EACCES; get_file(new_exe_file); } rcu_assign_pointer(mm->exe_file, new_exe_file); if (old_exe_file) { exe_file_allow_write_access(old_exe_file); fput(old_exe_file); } return 0; } /** * replace_mm_exe_file - replace a reference to the mm's executable file * @mm: The mm to change. * @new_exe_file: The new file to use. * * This changes mm's executable file (shown as symlink /proc/[pid]/exe). * * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE). */ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { struct vm_area_struct *vma; struct file *old_exe_file; int ret = 0; /* Forbid mm->exe_file change if old file still mapped. */ old_exe_file = get_mm_exe_file(mm); if (old_exe_file) { VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (path_equal(&vma->vm_file->f_path, &old_exe_file->f_path)) { ret = -EBUSY; break; } } mmap_read_unlock(mm); fput(old_exe_file); if (ret) return ret; } ret = exe_file_deny_write_access(new_exe_file); if (ret) return -EACCES; get_file(new_exe_file); /* set the new file */ mmap_write_lock(mm); old_exe_file = rcu_dereference_raw(mm->exe_file); rcu_assign_pointer(mm->exe_file, new_exe_file); mmap_write_unlock(mm); if (old_exe_file) { exe_file_allow_write_access(old_exe_file); fput(old_exe_file); } return 0; } /** * get_mm_exe_file - acquire a reference to the mm's executable file * @mm: The mm of interest. * * Returns %NULL if mm has no associated executable file. * User must release file via fput(). */ struct file *get_mm_exe_file(struct mm_struct *mm) { struct file *exe_file; rcu_read_lock(); exe_file = get_file_rcu(&mm->exe_file); rcu_read_unlock(); return exe_file; } /** * get_task_exe_file - acquire a reference to the task's executable file * @task: The task. * * Returns %NULL if task's mm (if any) has no associated executable file or * this is a kernel thread with borrowed mm (see the comment above get_task_mm). * User must release file via fput(). */ struct file *get_task_exe_file(struct task_struct *task) { struct file *exe_file = NULL; struct mm_struct *mm; if (task->flags & PF_KTHREAD) return NULL; task_lock(task); mm = task->mm; if (mm) exe_file = get_mm_exe_file(mm); task_unlock(task); return exe_file; } /** * get_task_mm - acquire a reference to the task's mm * @task: The task. * * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning * this kernel workthread has transiently adopted a user mm with use_mm, * to do its AIO) is not set and if so returns a reference to it, after * bumping up the use count. User must release the mm via mmput() * after use. Typically used by /proc and ptrace. */ struct mm_struct *get_task_mm(struct task_struct *task) { struct mm_struct *mm; if (task->flags & PF_KTHREAD) return NULL; task_lock(task); mm = task->mm; if (mm) mmget(mm); task_unlock(task); return mm; } EXPORT_SYMBOL_GPL(get_task_mm); static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode) { if (mm == current->mm) return true; if (ptrace_may_access(task, mode)) return true; if ((mode & PTRACE_MODE_READ) && perfmon_capable()) return true; return false; } struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) { struct mm_struct *mm; int err; err = down_read_killable(&task->signal->exec_update_lock); if (err) return ERR_PTR(err); mm = get_task_mm(task); if (!mm) { mm = ERR_PTR(-ESRCH); } else if (!may_access_mm(mm, task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } up_read(&task->signal->exec_update_lock); return mm; } static void complete_vfork_done(struct task_struct *tsk) { struct completion *vfork; task_lock(tsk); vfork = tsk->vfork_done; if (likely(vfork)) { tsk->vfork_done = NULL; complete(vfork); } task_unlock(tsk); } static int wait_for_vfork_done(struct task_struct *child, struct completion *vfork) { unsigned int state = TASK_KILLABLE|TASK_FREEZABLE; int killed; cgroup_enter_frozen(); killed = wait_for_completion_state(vfork, state); cgroup_leave_frozen(false); if (killed) { task_lock(child); child->vfork_done = NULL; task_unlock(child); } put_task_struct(child); return killed; } /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. * * mm_release is called after a mm_struct has been removed * from the current process. * * This difference is important for error handling, when we * only half set up a mm_struct for a new process and need to restore * the old one. Because we mmput the new mm_struct before * restoring the old one. . . * Eric Biederman 10 January 1998 */ static void mm_release(struct task_struct *tsk, struct mm_struct *mm) { uprobe_free_utask(tsk); /* Get rid of any cached register state */ deactivate_mm(tsk, mm); /* * Signal userspace if we're not exiting with a core dump * because we want to leave the value intact for debugging * purposes. */ if (tsk->clear_child_tid) { if (atomic_read(&mm->mm_users) > 1) { /* * We don't check the error code - if userspace has * not set up a proper pointer then tough luck. */ put_user(0, tsk->clear_child_tid); do_futex(tsk->clear_child_tid, FUTEX_WAKE, 1, NULL, NULL, 0, 0); } tsk->clear_child_tid = NULL; } /* * All done, finally we can wake up parent and return this mm to him. * Also kthread_stop() uses this completion for synchronization. */ if (tsk->vfork_done) complete_vfork_done(tsk); } void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm) { futex_exit_release(tsk); mm_release(tsk, mm); } void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm) { futex_exec_release(tsk); mm_release(tsk, mm); } /** * dup_mm() - duplicates an existing mm structure * @tsk: the task_struct with which the new mm will be associated. * @oldmm: the mm to duplicate. * * Allocates a new mm structure and duplicates the provided @oldmm structure * content into it. * * Return: the duplicated mm or NULL on failure. */ static struct mm_struct *dup_mm(struct task_struct *tsk, struct mm_struct *oldmm) { struct mm_struct *mm; int err; mm = allocate_mm(); if (!mm) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; uprobe_start_dup_mmap(); err = dup_mmap(mm, oldmm); if (err) goto free_pt; uprobe_end_dup_mmap(); mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; if (mm->binfmt && !try_module_get(mm->binfmt->module)) goto free_pt; return mm; free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; mm_init_owner(mm, NULL); mmput(mm); if (err) uprobe_end_dup_mmap(); fail_nomem: return NULL; } static int copy_mm(u64 clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; #ifdef CONFIG_DETECT_HUNG_TASK tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; tsk->last_switch_time = 0; #endif tsk->mm = NULL; tsk->active_mm = NULL; /* * Are we cloning a kernel thread? * * We need to steal a active VM for that.. */ oldmm = current->mm; if (!oldmm) return 0; if (clone_flags & CLONE_VM) { mmget(oldmm); mm = oldmm; } else { mm = dup_mm(tsk, current->mm); if (!mm) return -ENOMEM; } tsk->mm = mm; tsk->active_mm = mm; sched_mm_cid_fork(tsk); return 0; } static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ read_seqlock_excl(&fs->seq); /* "users" and "in_exec" locked for check_unsafe_exec() */ if (fs->in_exec) { read_sequnlock_excl(&fs->seq); return -EAGAIN; } fs->users++; read_sequnlock_excl(&fs->seq); return 0; } tsk->fs = copy_fs_struct(fs); if (!tsk->fs) return -ENOMEM; return 0; } static int copy_files(u64 clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; /* * A background process may not have any files ... */ oldf = current->files; if (!oldf) return 0; if (no_files) { tsk->files = NULL; return 0; } if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); return 0; } newf = dup_fd(oldf, NULL); if (IS_ERR(newf)) return PTR_ERR(newf); tsk->files = newf; return 0; } static int copy_sighand(u64 clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; if (clone_flags & CLONE_SIGHAND) { refcount_inc(&current->sighand->count); return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); RCU_INIT_POINTER(tsk->sighand, sig); if (!sig) return -ENOMEM; refcount_set(&sig->count, 1); spin_lock_irq(&current->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); spin_unlock_irq(&current->sighand->siglock); /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */ if (clone_flags & CLONE_CLEAR_SIGHAND) flush_signal_handlers(tsk, 0); return 0; } void __cleanup_sighand(struct sighand_struct *sighand) { if (refcount_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it * without an RCU grace period, see __lock_task_sighand(). */ kmem_cache_free(sighand_cachep, sighand); } } /* * Initialize POSIX timer handling for a thread group. */ static void posix_cpu_timers_init_group(struct signal_struct *sig) { struct posix_cputimers *pct = &sig->posix_cputimers; unsigned long cpu_limit; cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); posix_cputimers_group_init(pct, cpu_limit); } static int copy_signal(u64 clone_flags, struct task_struct *tsk) { struct signal_struct *sig; if (clone_flags & CLONE_THREAD) return 0; sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; if (!sig) return -ENOMEM; sig->nr_threads = 1; sig->quick_threads = 1; atomic_set(&sig->live, 1); refcount_set(&sig->sigcnt, 1); /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_HLIST_HEAD(&sig->multiprocess); seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); #ifdef CONFIG_POSIX_TIMERS INIT_HLIST_HEAD(&sig->posix_timers); INIT_HLIST_HEAD(&sig->ignored_posix_timers); hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); #endif task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); posix_cpu_timers_init_group(sig); tty_audit_fork(sig); sched_autogroup_fork(sig); #ifdef CONFIG_CGROUPS init_rwsem(&sig->cgroup_threadgroup_rwsem); #endif sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); init_rwsem(&sig->exec_update_lock); return 0; } static void copy_seccomp(struct task_struct *p) { #ifdef CONFIG_SECCOMP /* * Must be called with sighand->lock held, which is common to * all threads in the group. Holding cred_guard_mutex is not * needed because this new task is not yet running and cannot * be racing exec. */ assert_spin_locked(&current->sighand->siglock); /* Ref-count the new filter user, and assign it. */ get_seccomp_filter(current); p->seccomp = current->seccomp; /* * Explicitly enable no_new_privs here in case it got set * between the task_struct being duplicated and holding the * sighand lock. The seccomp state and nnp must be in sync. */ if (task_no_new_privs(current)) task_set_no_new_privs(p); /* * If the parent gained a seccomp mode after copying thread * flags and between before we held the sighand lock, we have * to manually enable the seccomp thread flag here. */ if (p->seccomp.mode != SECCOMP_MODE_DISABLED) set_task_syscall_work(p, SECCOMP); #endif } SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; return task_pid_vnr(current); } static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES p->pi_waiters = RB_ROOT_CACHED; p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif } static inline void init_task_pid_links(struct task_struct *task) { enum pid_type type; for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) INIT_HLIST_NODE(&task->pid_links[type]); } static inline void init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { if (type == PIDTYPE_PID) task->thread_pid = pid; else task->signal->pids[type] = pid; } static inline void rcu_copy_process(struct task_struct *p) { #ifdef CONFIG_PREEMPT_RCU p->rcu_read_lock_nesting = 0; p->rcu_read_unlock_special.s = 0; p->rcu_blocked_node = NULL; INIT_LIST_HEAD(&p->rcu_node_entry); #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU p->rcu_tasks_holdout = false; INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; INIT_LIST_HEAD(&p->rcu_tasks_exit_list); #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; p->trc_reader_special.s = 0; INIT_LIST_HEAD(&p->trc_holdout_list); INIT_LIST_HEAD(&p->trc_blkd_node); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } /** * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd * @flags: flags of the new @pidfd * @ret_file: return the new pidfs file * * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. * * The helper verifies that @pid is still in use, without PIDFD_THREAD the * task identified by @pid must be a thread-group leader. * * If this function returns successfully the caller is responsible to either * call fd_install() passing the returned pidfd and pidfd file as arguments in * order to install the pidfd into its file descriptor table or they must use * put_unused_fd() and fput() on the returned pidfd and pidfd file * respectively. * * This function is useful when a pidfd must already be reserved but there * might still be points of failure afterwards and the caller wants to ensure * that no pidfd is leaked into its file descriptor table. * * Return: On success, a reserved pidfd is returned from the function and a new * pidfd file is returned in the last argument to the function. On * error, a negative error code is returned from the function and the * last argument remains unchanged. */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file) { struct file *pidfs_file; /* * PIDFD_STALE is only allowed to be passed if the caller knows * that @pid is already registered in pidfs and thus * PIDFD_INFO_EXIT information is guaranteed to be available. */ if (!(flags & PIDFD_STALE)) { /* * While holding the pidfd waitqueue lock removing the * task linkage for the thread-group leader pid * (PIDTYPE_TGID) isn't possible. Thus, if there's still * task linkage for PIDTYPE_PID not having thread-group * leader linkage for the pid means it wasn't a * thread-group leader in the first place. */ guard(spinlock_irq)(&pid->wait_pidfd.lock); /* Task has already been reaped. */ if (!pid_has_task(pid, PIDTYPE_PID)) return -ESRCH; /* * If this struct pid isn't used as a thread-group * leader but the caller requested to create a * thread-group leader pidfd then report ENOENT. */ if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID)) return -ENOENT; } CLASS(get_unused_fd, pidfd)(O_CLOEXEC); if (pidfd < 0) return pidfd; pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR); if (IS_ERR(pidfs_file)) return PTR_ERR(pidfs_file); *ret_file = pidfs_file; return take_fd(pidfd); } static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); free_task(tsk); } static __always_inline void delayed_free_task(struct task_struct *tsk) { if (IS_ENABLED(CONFIG_MEMCG)) call_rcu(&tsk->rcu, __delayed_free_task); else free_task(tsk); } static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) { /* Skip if kernel thread */ if (!tsk->mm) return; /* Skip if spawning a thread or using vfork */ if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM) return; /* We need to synchronize with __set_oom_adj */ mutex_lock(&oom_adj_mutex); mm_flags_set(MMF_MULTIPROCESS, tsk->mm); /* Update the values in case they were changed after copy_signal */ tsk->signal->oom_score_adj = current->signal->oom_score_adj; tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_unlock(&oom_adj_mutex); } #ifdef CONFIG_RV static void rv_task_fork(struct task_struct *p) { memset(&p->rv, 0, sizeof(p->rv)); } #else #define rv_task_fork(p) do {} while (0) #endif static bool need_futex_hash_allocate_default(u64 clone_flags) { if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM)) return false; return true; } /* * This creates a new process as a copy of the old one, * but does not actually start it yet. * * It copies the registers, and all the appropriate * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ __latent_entropy struct task_struct *copy_process( struct pid *pid, int trace, int node, struct kernel_clone_args *args) { int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; struct file *pidfile = NULL; const u64 clone_flags = args->flags; struct nsproxy *nsp = current->nsproxy; /* * Don't allow sharing the root directory with processes in a different * namespace */ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) return ERR_PTR(-EINVAL); /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); /* * Siblings of global init remain as zombies on exit since they are * not reaped by their parent (swapper). To solve this and to avoid * multi-rooted process trees, prevent global and container-inits * from creating siblings. */ if ((clone_flags & CLONE_PARENT) && current->signal->flags & SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL); /* * If the new process will be in a different pid or user namespace * do not allow it to share a thread group with the forking task. */ if (clone_flags & CLONE_THREAD) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || (task_active_pid_ns(current) != nsp->pid_ns_for_children)) return ERR_PTR(-EINVAL); } if (clone_flags & CLONE_PIDFD) { /* * - CLONE_DETACHED is blocked so that we can potentially * reuse it later for CLONE_PIDFD. */ if (clone_flags & CLONE_DETACHED) return ERR_PTR(-EINVAL); } /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple * processes that happen during the fork and delay them so that * they appear to happen after the fork. */ sigemptyset(&delayed.signal); INIT_HLIST_NODE(&delayed.node); spin_lock_irq(&current->sighand->siglock); if (!(clone_flags & CLONE_THREAD)) hlist_add_head(&delayed.node, &current->signal->multiprocess); recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); retval = -ERESTARTNOINTR; if (task_sigpending(current)) goto fork_out; retval = -ENOMEM; p = dup_task_struct(current, node); if (!p) goto fork_out; p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; if (args->user_worker) { /* * Mark us a user worker, and block any signal that isn't * fatal or STOP */ p->flags |= PF_USER_WORKER; siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } if (args->io_thread) p->flags |= PF_IO_WORKER; if (args->name) strscpy_pad(p->comm, args->name, sizeof(p->comm)); p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL; ftrace_graph_init_task(p); rt_mutex_init_task(p); lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = copy_creds(p, clone_flags); if (retval < 0) goto bad_fork_free; retval = -EAGAIN; if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_cleanup_count; } current->flags &= ~PF_NPROC_EXCEEDED; /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. */ retval = -EAGAIN; if (data_race(nr_threads >= max_threads)) goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); p->utime = p->stime = p->gtime = 0; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME p->utimescaled = p->stimescaled = 0; #endif prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqcount_init(&p->vtime.seqcount); p->vtime.starttime = 0; p->vtime.state = VTIME_INACTIVE; #endif #ifdef CONFIG_IO_URING p->io_uring = NULL; #endif p->default_timer_slack_ns = current->timer_slack_ns; #ifdef CONFIG_PSI p->psi_flags = 0; #endif task_io_accounting_init(&p->ioac); acct_clear_integrals(p); posix_cputimers_init(&p->posix_cputimers); tick_dep_init_task(p); p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); if (args->kthread) { if (!set_kthread_struct(p)) goto bad_fork_cleanup_delayacct; } #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; goto bad_fork_cleanup_delayacct; } #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); #endif #ifdef CONFIG_TRACE_IRQFLAGS memset(&p->irqtrace, 0, sizeof(p->irqtrace)); p->irqtrace.hardirq_disable_ip = _THIS_IP_; p->irqtrace.softirq_enable_ip = _THIS_IP_; p->softirqs_enabled = 1; p->softirq_context = 0; #endif p->pagefault_disabled = 0; lockdep_init_task(p); p->blocked_on = NULL; /* not blocked yet */ #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; #endif #ifdef CONFIG_BPF_SYSCALL RCU_INIT_POINTER(p->bpf_storage, NULL); p->bpf_ctx = NULL; #endif unwind_task_init(p); /* Perform scheduler related setup. Assign this task to a CPU. */ retval = sched_fork(clone_flags, p); if (retval) goto bad_fork_cleanup_policy; retval = perf_event_init_task(p, clone_flags); if (retval) goto bad_fork_sched_cancel_fork; retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; /* copy all the process information */ shm_init_task(p); retval = security_task_alloc(p, clone_flags); if (retval) goto bad_fork_cleanup_audit; retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_security; retval = copy_files(clone_flags, p, args->no_files); if (retval) goto bad_fork_cleanup_semundo; retval = copy_fs(clone_flags, p); if (retval) goto bad_fork_cleanup_files; retval = copy_sighand(clone_flags, p); if (retval) goto bad_fork_cleanup_fs; retval = copy_signal(clone_flags, p); if (retval) goto bad_fork_cleanup_sighand; retval = copy_mm(clone_flags, p); if (retval) goto bad_fork_cleanup_signal; retval = copy_namespaces(clone_flags, p); if (retval) goto bad_fork_cleanup_mm; retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; retval = copy_thread(p, args); if (retval) goto bad_fork_cleanup_io; stackleak_task_init(p); if (pid != &init_struct_pid) { pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid, args->set_tid_size); if (IS_ERR(pid)) { retval = PTR_ERR(pid); goto bad_fork_cleanup_thread; } } /* * This has to happen after we've potentially unshared the file * descriptor table (so that the pidfd doesn't leak into the child * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; /* * Note that no task has been attached to @pid yet indicate * that via CLONE_PIDFD. */ retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; } #ifdef CONFIG_BLOCK p->plug = NULL; #endif futex_init_task(p); /* * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) sas_ss_reset(p); /* * Syscall tracing and stepping should be turned off in the * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); clear_task_syscall_work(p, SYSCALL_TRACE); #if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) clear_task_syscall_work(p, SYSCALL_EMU); #endif clear_tsk_latency_tracing(p); /* ok, now we should be set up.. */ p->pid = pid_nr(pid); if (clone_flags & CLONE_THREAD) { p->group_leader = current->group_leader; p->tgid = current->tgid; } else { p->group_leader = p; p->tgid = p->pid; } p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); p->dirty_paused_when = 0; p->pdeath_signal = 0; p->task_works = NULL; clear_posix_cputimers_work(p); #ifdef CONFIG_KRETPROBES p->kretprobe_instances.first = NULL; #endif #ifdef CONFIG_RETHOOK p->rethooks.first = NULL; #endif /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted that the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress. */ retval = cgroup_can_fork(p, args); if (retval) goto bad_fork_put_pidfd; /* * Now that the cgroups are pinned, re-clone the parent cgroup and put * the new task on the correct runqueue. All this *before* the task * becomes visible. * * This isn't part of ->can_fork() because while the re-cloning is * cgroup specific, it unconditionally needs to place the task on a * runqueue. */ retval = sched_cgroup_fork(p, args); if (retval) goto bad_fork_cancel_cgroup; /* * Allocate a default futex hash for the user process once the first * thread spawns. */ if (need_futex_hash_allocate_default(clone_flags)) { retval = futex_hash_allocate_default(); if (retval) goto bad_fork_cancel_cgroup; /* * If we fail beyond this point we don't free the allocated * futex hash map. We assume that another thread will be created * and makes use of it. The hash map will be freed once the main * thread terminates. */ } /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do * not want user-space to be able to predict the process start-time by * stalling fork(2) after we recorded the start_time but before it is * visible to the system. */ p->start_time = ktime_get_ns(); p->start_boottime = ktime_get_boottime_ns(); /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; if (clone_flags & CLONE_THREAD) p->exit_signal = -1; else p->exit_signal = current->group_leader->exit_signal; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; p->exit_signal = args->exit_signal; } klp_copy_process(p); sched_core_fork(p); spin_lock(&current->sighand->siglock); rv_task_fork(p); rseq_fork(p, clone_flags); /* Don't start children in a dying pid namespace */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_core_free; } /* Let kill terminate clone/fork in the middle */ if (fatal_signal_pending(current)) { retval = -EINTR; goto bad_fork_core_free; } /* No more failure paths after this point. */ /* * Copy seccomp details explicitly here, in case they were changed * before holding sighand lock. */ copy_seccomp(p); init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); init_task_pid(p, PIDTYPE_PID, pid); if (thread_group_leader(p)) { init_task_pid(p, PIDTYPE_TGID, pid); init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); init_task_pid(p, PIDTYPE_SID, task_session(current)); if (is_child_reaper(pid)) { ns_of_pid(pid)->child_reaper = p; p->signal->flags |= SIGNAL_UNKILLABLE; } p->signal->shared_pending.signal = delayed.signal; p->signal->tty = tty_kref_get(current->signal->tty); /* * Inherit has_child_subreaper flag under the same * tasklist_lock with adding child to the process tree * for propagate_has_child_subreaper optimization. */ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_TGID); attach_pid(p, PIDTYPE_PGID); attach_pid(p, PIDTYPE_SID); __this_cpu_inc(process_counts); } else { current->signal->nr_threads++; current->signal->quick_threads++; atomic_inc(&current->signal->live); refcount_inc(&current->signal->sigcnt); task_join_group_stop(p); list_add_tail_rcu(&p->thread_node, &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); nr_threads++; } total_forks++; hlist_del_init(&delayed.node); spin_unlock(&current->sighand->siglock); syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); if (pidfile) fd_install(pidfd, pidfile); proc_fork_connector(p); sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); user_events_fork(p, clone_flags); copy_oom_score_adj(clone_flags, p); return p; bad_fork_core_free: sched_core_free(p); spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); bad_fork_cancel_cgroup: cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { fput(pidfile); put_unused_fd(pidfd); } bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: exit_thread(p); bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: exit_nsproxy_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { sched_mm_cid_exit(p); mm_clear_owner(p->mm, p); mmput(p->mm); } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_security: security_task_free(p); bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_perf: perf_event_free_task(p); bad_fork_sched_cancel_fork: sched_cancel_fork(p); bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); #endif bad_fork_cleanup_delayacct: delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); exit_cred_namespaces(p); exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); exit_task_stack_account(p); put_task_stack(p); delayed_free_task(p); fork_out: spin_lock_irq(&current->sighand->siglock); hlist_del_init(&delayed.node); spin_unlock_irq(&current->sighand->siglock); return ERR_PTR(retval); } static inline void init_idle_pids(struct task_struct *idle) { enum pid_type type; for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */ init_task_pid(idle, type, &init_struct_pid); } } static int idle_dummy(void *dummy) { /* This function is never called */ return 0; } struct task_struct * __init fork_idle(int cpu) { struct task_struct *task; struct kernel_clone_args args = { .flags = CLONE_VM, .fn = &idle_dummy, .fn_arg = NULL, .kthread = 1, .idle = 1, }; task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); if (!IS_ERR(task)) { init_idle_pids(task); init_idle(task, cpu); } return task; } /* * This is like kernel_clone(), but shaved down and tailored to just * creating io_uring workers. It returns a created task, or an error pointer. * The returned task is inactive, and the caller must fire it up through * wake_up_new_task(p). All signals are blocked in the created task. */ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) { unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| CLONE_IO|CLONE_VM|CLONE_UNTRACED; struct kernel_clone_args args = { .flags = flags, .fn = fn, .fn_arg = arg, .io_thread = 1, .user_worker = 1, }; return copy_process(NULL, 0, node, &args); } /* * Ok, this is the main fork-routine. * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. * * args->exit_signal is expected to be checked for sanity by the caller. */ pid_t kernel_clone(struct kernel_clone_args *args) { u64 clone_flags = args->flags; struct completion vfork; struct pid *pid; struct task_struct *p; int trace = 0; pid_t nr; /* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate * field in struct clone_args and it still doesn't make sense to have * them both point at the same memory location. Performing this check * here has the advantage that we don't need to have a separate helper * to check for legacy clone(). */ if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID) && (args->pidfd == args->parent_tid)) return -EINVAL; /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. */ if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if (args->exit_signal != SIGCHLD) trace = PTRACE_EVENT_CLONE; else trace = PTRACE_EVENT_FORK; if (likely(!ptrace_event_enabled(current, trace))) trace = 0; } p = copy_process(NULL, trace, NUMA_NO_NODE, args); add_latent_entropy(); if (IS_ERR(p)) return PTR_ERR(p); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. */ trace_sched_process_fork(current, p); pid = get_task_pid(p, PIDTYPE_PID); nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, args->parent_tid); if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); get_task_struct(p); } if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) { /* lock the task to synchronize with memcg migration */ task_lock(p); lru_gen_add_mm(p->mm); task_unlock(p); } wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ if (unlikely(trace)) ptrace_event_pid(trace, pid); if (clone_flags & CLONE_VFORK) { if (!wait_for_vfork_done(p, &vfork)) ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } put_pid(pid); return nr; } /* * Create a kernel thread. */ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, unsigned long flags) { struct kernel_clone_args args = { .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, .name = name, .kthread = 1, }; return kernel_clone(&args); } /* * Create a user mode thread. */ pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct kernel_clone_args args = { .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, }; return kernel_clone(&args); } #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU struct kernel_clone_args args = { .exit_signal = SIGCHLD, }; return kernel_clone(&args); #else /* can not support in nommu mode */ return -EINVAL; #endif } #endif #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { struct kernel_clone_args args = { .flags = CLONE_VFORK | CLONE_VM, .exit_signal = SIGCHLD, }; return kernel_clone(&args); } #endif #ifdef __ARCH_WANT_SYS_CLONE #ifdef CONFIG_CLONE_BACKWARDS SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, unsigned long, tls, int __user *, child_tidptr) #elif defined(CONFIG_CLONE_BACKWARDS2) SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls) #elif defined(CONFIG_CLONE_BACKWARDS3) SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, int, stack_size, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls) #else SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls) #endif { struct kernel_clone_args args = { .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), .pidfd = parent_tidptr, .child_tid = child_tidptr, .parent_tid = parent_tidptr, .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), .stack = newsp, .tls = tls, }; return kernel_clone(&args); } #endif static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) { int err; struct clone_args args; pid_t *kset_tid = kargs->set_tid; BUILD_BUG_ON(offsetofend(struct clone_args, tls) != CLONE_ARGS_SIZE_VER0); BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) != CLONE_ARGS_SIZE_VER1); BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) != CLONE_ARGS_SIZE_VER2); BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2); if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) return -EINVAL; err = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (err) return err; if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) return -EINVAL; if (unlikely(!args.set_tid && args.set_tid_size > 0)) return -EINVAL; if (unlikely(args.set_tid && args.set_tid_size == 0)) return -EINVAL; /* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal */ if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || !valid_signal(args.exit_signal))) return -EINVAL; if ((args.flags & CLONE_INTO_CGROUP) && (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)) return -EINVAL; *kargs = (struct kernel_clone_args){ .flags = args.flags, .pidfd = u64_to_user_ptr(args.pidfd), .child_tid = u64_to_user_ptr(args.child_tid), .parent_tid = u64_to_user_ptr(args.parent_tid), .exit_signal = args.exit_signal, .stack = args.stack, .stack_size = args.stack_size, .tls = args.tls, .set_tid_size = args.set_tid_size, .cgroup = args.cgroup, }; if (args.set_tid && copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), (kargs->set_tid_size * sizeof(pid_t)))) return -EFAULT; kargs->set_tid = kset_tid; return 0; } /** * clone3_stack_valid - check and prepare stack * @kargs: kernel clone args * * Verify that the stack arguments userspace gave us are sane. * In addition, set the stack direction for userspace since it's easy for us to * determine. */ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) { if (kargs->stack == 0) { if (kargs->stack_size > 0) return false; } else { if (kargs->stack_size == 0) return false; if (!access_ok((void __user *)kargs->stack, kargs->stack_size)) return false; #if !defined(CONFIG_STACK_GROWSUP) kargs->stack += kargs->stack_size; #endif } return true; } static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. */ if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) return false; /* * - make the CLONE_DETACHED bit reusable for clone3 * - make the CSIGNAL bits reusable for clone3 */ if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME)))) return false; if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) return false; if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && kargs->exit_signal) return false; if (!clone3_stack_valid(kargs)) return false; return true; } /** * sys_clone3 - create a new process with specific properties * @uargs: argument structure * @size: size of @uargs * * clone3() is the extensible successor to clone()/clone2(). * It takes a struct as argument that is versioned by its size. * * Return: On success, a positive PID for the child process. * On error, a negative errno number. */ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) { int err; struct kernel_clone_args kargs; pid_t set_tid[MAX_PID_NS_LEVEL]; #ifdef __ARCH_BROKEN_SYS_CLONE3 #warning clone3() entry point is missing, please fix return -ENOSYS; #endif kargs.set_tid = set_tid; err = copy_clone_args_from_user(&kargs, uargs, size); if (err) return err; if (!clone3_args_valid(&kargs)) return -EINVAL; return kernel_clone(&kargs); } void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data) { struct task_struct *leader, *parent, *child; int res; read_lock(&tasklist_lock); leader = top = top->group_leader; down: for_each_thread(leader, parent) { list_for_each_entry(child, &parent->children, sibling) { res = visitor(child, data); if (res) { if (res < 0) goto out; leader = child; goto down; } up: ; } } if (leader != top) { child = leader; parent = child->real_parent; leader = parent->group_leader; goto up; } out: read_unlock(&tasklist_lock); } #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif static void sighand_ctor(void *data) { struct sighand_struct *sighand = data; spin_lock_init(&sighand->siglock); init_waitqueue_head(&sighand->signalfd_wqh); } void __init mm_cache_init(void) { unsigned int mm_size; /* * The mm_cpumask is located at the end of mm_struct, and is * dynamically sized based on the maximum CPU number this system * can have, taking hotplug into account (nr_cpu_ids). */ mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size(); mm_cachep = kmem_cache_create_usercopy("mm_struct", mm_size, ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, offsetof(struct mm_struct, saved_auxv), sizeof_field(struct mm_struct, saved_auxv), NULL); } void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); mmap_init(); nsproxy_cache_init(); } /* * Check constraints on flags passed to the unshare system call. */ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| CLONE_NEWTIME)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing * to unshare. Note that unsharing the address space or the * signal handlers also need to unshare the signal queues (aka * CLONE_THREAD). */ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { if (!thread_group_empty(current)) return -EINVAL; } if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { if (refcount_read(&current->sighand->count) > 1) return -EINVAL; } if (unshare_flags & CLONE_VM) { if (!current_is_single_threaded()) return -EINVAL; } return 0; } /* * Unshare the filesystem structure if it is being shared */ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) { struct fs_struct *fs = current->fs; if (!(unshare_flags & CLONE_FS) || !fs) return 0; /* don't need lock here; in the worst case we'll do useless copy */ if (fs->users == 1) return 0; *new_fsp = copy_fs_struct(fs); if (!*new_fsp) return -ENOMEM; return 0; } /* * Unshare file descriptor table if it is being shared */ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) { struct files_struct *fd = current->files; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { fd = dup_fd(fd, NULL); if (IS_ERR(fd)) return PTR_ERR(fd); *new_fdp = fd; } return 0; } /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* * functions used by kernel_clone() cannot be used here directly * because they modify an inactive task_struct that is being * constructed. Here we are modifying the current, active, * task_struct. */ int ksys_unshare(unsigned long unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *new_fd = NULL; struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; int err; /* * If unsharing a user namespace must also unshare the thread group * and unshare the filesystem root and working directories. */ if (unshare_flags & CLONE_NEWUSER) unshare_flags |= CLONE_THREAD | CLONE_FS; /* * If unsharing vm, must also unshare signal handlers. */ if (unshare_flags & CLONE_VM) unshare_flags |= CLONE_SIGHAND; /* * If unsharing a signal handlers, must also unshare the signal queues. */ if (unshare_flags & CLONE_SIGHAND) unshare_flags |= CLONE_THREAD; /* * If unsharing namespace, must also unshare filesystem information. */ if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; err = check_unshare_flags(unshare_flags); if (err) goto bad_unshare_out; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old * namespace are unreachable. */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); if (err) goto bad_unshare_cleanup_fd; err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_cred, new_fs); if (err) goto bad_unshare_cleanup_cred; if (new_cred) { err = set_cred_ucounts(new_cred); if (err) goto bad_unshare_cleanup_cred; } if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). */ exit_sem(current); } if (unshare_flags & CLONE_NEWIPC) { /* Orphan segments in old ns (see sem above). */ exit_shm(current); shm_init_task(current); } if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); task_lock(current); if (new_fs) { fs = current->fs; read_seqlock_excl(&fs->seq); current->fs = new_fs; if (--fs->users) new_fs = NULL; else new_fs = fs; read_sequnlock_excl(&fs->seq); } if (new_fd) swap(current->files, new_fd); task_unlock(current); if (new_cred) { /* Install the new user namespace */ commit_creds(new_cred); new_cred = NULL; } } perf_event_namespaces(current); bad_unshare_cleanup_cred: if (new_cred) put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); bad_unshare_out: return err; } SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { return ksys_unshare(unshare_flags); } /* * Helper to unshare the files of the current task. * We don't want to expose copy_files internals to * the exec layer of the kernel. */ int unshare_files(void) { struct task_struct *task = current; struct files_struct *old, *copy = NULL; int error; error = unshare_fd(CLONE_FILES, &copy); if (error || !copy) return error; old = task->files; task_lock(task); task->files = copy; task_unlock(task); put_files_struct(old); return 0; } static int sysctl_max_threads(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int ret; int threads = max_threads; int min = 1; int max = MAX_THREADS; t = *table; t.data = &threads; t.extra1 = &min; t.extra2 = &max; ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (ret || !write) return ret; max_threads = threads; return 0; } static const struct ctl_table fork_sysctl_table[] = { { .procname = "threads-max", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = sysctl_max_threads, }, }; static int __init init_fork_sysctl(void) { register_sysctl_init("kernel", fork_sysctl_table); return 0; } subsys_initcall(init_fork_sysctl);
13 13 13 4 4 6 13 13 13 13 11 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * Copyright (C) 2015-2016 Nobuo Iwata */ #include <linux/init.h> #include <linux/file.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/platform_device.h> #include <linux/slab.h> #include <linux/string_choices.h> #include "usbip_common.h" #include "vhci.h" #define DRIVER_AUTHOR "Takahiro Hirofuchi" #define DRIVER_DESC "USB/IP 'Virtual' Host Controller (VHCI) Driver" /* * TODO * - update root hub emulation * - move the emulation code to userland ? * porting to other operating systems * minimize kernel code * - add suspend/resume code * - clean up everything */ /* See usb gadget dummy hcd */ static int vhci_hub_status(struct usb_hcd *hcd, char *buff); static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buff, u16 wLength); static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags); static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status); static int vhci_start(struct usb_hcd *vhci_hcd); static void vhci_stop(struct usb_hcd *hcd); static int vhci_get_frame_number(struct usb_hcd *hcd); static const char driver_name[] = "vhci_hcd"; static const char driver_desc[] = "USB/IP Virtual Host Controller"; int vhci_num_controllers = VHCI_NR_HCS; struct vhci *vhcis; static const char * const bit_desc[] = { "CONNECTION", /*0*/ "ENABLE", /*1*/ "SUSPEND", /*2*/ "OVER_CURRENT", /*3*/ "RESET", /*4*/ "L1", /*5*/ "R6", /*6*/ "R7", /*7*/ "POWER", /*8*/ "LOWSPEED", /*9*/ "HIGHSPEED", /*10*/ "PORT_TEST", /*11*/ "INDICATOR", /*12*/ "R13", /*13*/ "R14", /*14*/ "R15", /*15*/ "C_CONNECTION", /*16*/ "C_ENABLE", /*17*/ "C_SUSPEND", /*18*/ "C_OVER_CURRENT", /*19*/ "C_RESET", /*20*/ "C_L1", /*21*/ "R22", /*22*/ "R23", /*23*/ "R24", /*24*/ "R25", /*25*/ "R26", /*26*/ "R27", /*27*/ "R28", /*28*/ "R29", /*29*/ "R30", /*30*/ "R31", /*31*/ }; static const char * const bit_desc_ss[] = { "CONNECTION", /*0*/ "ENABLE", /*1*/ "SUSPEND", /*2*/ "OVER_CURRENT", /*3*/ "RESET", /*4*/ "L1", /*5*/ "R6", /*6*/ "R7", /*7*/ "R8", /*8*/ "POWER", /*9*/ "HIGHSPEED", /*10*/ "PORT_TEST", /*11*/ "INDICATOR", /*12*/ "R13", /*13*/ "R14", /*14*/ "R15", /*15*/ "C_CONNECTION", /*16*/ "C_ENABLE", /*17*/ "C_SUSPEND", /*18*/ "C_OVER_CURRENT", /*19*/ "C_RESET", /*20*/ "C_BH_RESET", /*21*/ "C_LINK_STATE", /*22*/ "C_CONFIG_ERROR", /*23*/ "R24", /*24*/ "R25", /*25*/ "R26", /*26*/ "R27", /*27*/ "R28", /*28*/ "R29", /*29*/ "R30", /*30*/ "R31", /*31*/ }; static void dump_port_status_diff(u32 prev_status, u32 new_status, bool usb3) { int i = 0; u32 bit = 1; const char * const *desc = bit_desc; if (usb3) desc = bit_desc_ss; pr_debug("status prev -> new: %08x -> %08x\n", prev_status, new_status); while (bit) { u32 prev = prev_status & bit; u32 new = new_status & bit; char change; if (!prev && new) change = '+'; else if (prev && !new) change = '-'; else change = ' '; if (prev || new) { pr_debug(" %c%s\n", change, desc[i]); if (bit == 1) /* USB_PORT_STAT_CONNECTION */ pr_debug(" %c%s\n", change, "USB_PORT_STAT_SPEED_5GBPS"); } bit <<= 1; i++; } pr_debug("\n"); } void rh_port_connect(struct vhci_device *vdev, enum usb_device_speed speed) { struct vhci_hcd *vhci_hcd = vdev_to_vhci_hcd(vdev); struct vhci *vhci = vhci_hcd->vhci; int rhport = vdev->rhport; u32 status; unsigned long flags; usbip_dbg_vhci_rh("rh_port_connect %d\n", rhport); spin_lock_irqsave(&vhci->lock, flags); status = vhci_hcd->port_status[rhport]; status |= USB_PORT_STAT_CONNECTION | (1 << USB_PORT_FEAT_C_CONNECTION); switch (speed) { case USB_SPEED_HIGH: status |= USB_PORT_STAT_HIGH_SPEED; break; case USB_SPEED_LOW: status |= USB_PORT_STAT_LOW_SPEED; break; default: break; } vhci_hcd->port_status[rhport] = status; spin_unlock_irqrestore(&vhci->lock, flags); usb_hcd_poll_rh_status(vhci_hcd_to_hcd(vhci_hcd)); } static void rh_port_disconnect(struct vhci_device *vdev) { struct vhci_hcd *vhci_hcd = vdev_to_vhci_hcd(vdev); struct vhci *vhci = vhci_hcd->vhci; int rhport = vdev->rhport; u32 status; unsigned long flags; usbip_dbg_vhci_rh("rh_port_disconnect %d\n", rhport); spin_lock_irqsave(&vhci->lock, flags); status = vhci_hcd->port_status[rhport]; status &= ~USB_PORT_STAT_CONNECTION; status |= (1 << USB_PORT_FEAT_C_CONNECTION); vhci_hcd->port_status[rhport] = status; spin_unlock_irqrestore(&vhci->lock, flags); usb_hcd_poll_rh_status(vhci_hcd_to_hcd(vhci_hcd)); } #define PORT_C_MASK \ ((USB_PORT_STAT_C_CONNECTION \ | USB_PORT_STAT_C_ENABLE \ | USB_PORT_STAT_C_SUSPEND \ | USB_PORT_STAT_C_OVERCURRENT \ | USB_PORT_STAT_C_RESET) << 16) /* * Returns 0 if the status hasn't changed, or the number of bytes in buf. * Ports are 0-indexed from the HCD point of view, * and 1-indexed from the USB core pointer of view. * * @buf: a bitmap to show which port status has been changed. * bit 0: reserved * bit 1: the status of port 0 has been changed. * bit 2: the status of port 1 has been changed. * ... */ static int vhci_hub_status(struct usb_hcd *hcd, char *buf) { struct vhci_hcd *vhci_hcd = hcd_to_vhci_hcd(hcd); struct vhci *vhci = vhci_hcd->vhci; int retval = DIV_ROUND_UP(VHCI_HC_PORTS + 1, 8); int rhport; int changed = 0; unsigned long flags; memset(buf, 0, retval); spin_lock_irqsave(&vhci->lock, flags); if (!HCD_HW_ACCESSIBLE(hcd)) { usbip_dbg_vhci_rh("hw accessible flag not on?\n"); goto done; } /* check pseudo status register for each port */ for (rhport = 0; rhport < VHCI_HC_PORTS; rhport++) { if ((vhci_hcd->port_status[rhport] & PORT_C_MASK)) { /* The status of a port has been changed, */ usbip_dbg_vhci_rh("port %d status changed\n", rhport); buf[(rhport + 1) / 8] |= 1 << (rhport + 1) % 8; changed = 1; } } if ((hcd->state == HC_STATE_SUSPENDED) && (changed == 1)) usb_hcd_resume_root_hub(hcd); done: spin_unlock_irqrestore(&vhci->lock, flags); return changed ? retval : 0; } /* usb 3.0 root hub device descriptor */ static struct { struct usb_bos_descriptor bos; struct usb_ss_cap_descriptor ss_cap; } __packed usb3_bos_desc = { .bos = { .bLength = USB_DT_BOS_SIZE, .bDescriptorType = USB_DT_BOS, .wTotalLength = cpu_to_le16(sizeof(usb3_bos_desc)), .bNumDeviceCaps = 1, }, .ss_cap = { .bLength = USB_DT_USB_SS_CAP_SIZE, .bDescriptorType = USB_DT_DEVICE_CAPABILITY, .bDevCapabilityType = USB_SS_CAP_TYPE, .wSpeedSupported = cpu_to_le16(USB_5GBPS_OPERATION), .bFunctionalitySupport = ilog2(USB_5GBPS_OPERATION), }, }; static inline void ss_hub_descriptor(struct usb_hub_descriptor *desc) { memset(desc, 0, sizeof *desc); desc->bDescriptorType = USB_DT_SS_HUB; desc->bDescLength = 12; desc->wHubCharacteristics = cpu_to_le16( HUB_CHAR_INDV_PORT_LPSM | HUB_CHAR_COMMON_OCPM); desc->bNbrPorts = VHCI_HC_PORTS; desc->u.ss.bHubHdrDecLat = 0x04; /* Worst case: 0.4 micro sec*/ desc->u.ss.DeviceRemovable = 0xffff; } static inline void hub_descriptor(struct usb_hub_descriptor *desc) { int width; memset(desc, 0, sizeof(*desc)); desc->bDescriptorType = USB_DT_HUB; desc->wHubCharacteristics = cpu_to_le16( HUB_CHAR_INDV_PORT_LPSM | HUB_CHAR_COMMON_OCPM); desc->bNbrPorts = VHCI_HC_PORTS; BUILD_BUG_ON(VHCI_HC_PORTS > USB_MAXCHILDREN); width = desc->bNbrPorts / 8 + 1; desc->bDescLength = USB_DT_HUB_NONVAR_SIZE + 2 * width; memset(&desc->u.hs.DeviceRemovable[0], 0, width); memset(&desc->u.hs.DeviceRemovable[width], 0xff, width); } static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength) { struct vhci_hcd *vhci_hcd; struct vhci *vhci; int retval = 0; int rhport = -1; unsigned long flags; bool invalid_rhport = false; u32 prev_port_status[VHCI_HC_PORTS]; if (!HCD_HW_ACCESSIBLE(hcd)) return -ETIMEDOUT; /* * NOTE: * wIndex (bits 0-7) shows the port number and begins from 1? */ wIndex = ((__u8)(wIndex & 0x00ff)); usbip_dbg_vhci_rh("typeReq %x wValue %x wIndex %x\n", typeReq, wValue, wIndex); /* * wIndex can be 0 for some request types (typeReq). rhport is * in valid range when wIndex >= 1 and < VHCI_HC_PORTS. * * Reference port_status[] only with valid rhport when * invalid_rhport is false. */ if (wIndex < 1 || wIndex > VHCI_HC_PORTS) { invalid_rhport = true; if (wIndex > VHCI_HC_PORTS) dev_err(hcd_dev(hcd), "invalid port number %d\n", wIndex); } else { rhport = wIndex - 1; } vhci_hcd = hcd_to_vhci_hcd(hcd); vhci = vhci_hcd->vhci; spin_lock_irqsave(&vhci->lock, flags); /* store old status and compare now and old later */ if (usbip_dbg_flag_vhci_rh) { if (!invalid_rhport) memcpy(prev_port_status, vhci_hcd->port_status, sizeof(prev_port_status)); } switch (typeReq) { case ClearHubFeature: usbip_dbg_vhci_rh(" ClearHubFeature\n"); break; case ClearPortFeature: if (invalid_rhport) { dev_err(hcd_dev(hcd), "invalid port number %d\n", wIndex); goto error; } switch (wValue) { case USB_PORT_FEAT_SUSPEND: if (hcd->speed >= HCD_USB3) { dev_err(hcd_dev(hcd), "ClearPortFeature: USB_PORT_FEAT_SUSPEND req not supported for USB 3.0 roothub\n"); goto error; } usbip_dbg_vhci_rh( " ClearPortFeature: USB_PORT_FEAT_SUSPEND\n"); if (vhci_hcd->port_status[rhport] & USB_PORT_STAT_SUSPEND) { /* 20msec signaling */ vhci_hcd->resuming = 1; vhci_hcd->re_timeout = jiffies + msecs_to_jiffies(20); } break; case USB_PORT_FEAT_POWER: usbip_dbg_vhci_rh( " ClearPortFeature: USB_PORT_FEAT_POWER\n"); if (hcd->speed >= HCD_USB3) vhci_hcd->port_status[rhport] &= ~USB_SS_PORT_STAT_POWER; else vhci_hcd->port_status[rhport] &= ~USB_PORT_STAT_POWER; break; default: usbip_dbg_vhci_rh(" ClearPortFeature: default %x\n", wValue); if (wValue >= 32) goto error; vhci_hcd->port_status[rhport] &= ~(1 << wValue); break; } break; case GetHubDescriptor: usbip_dbg_vhci_rh(" GetHubDescriptor\n"); if (hcd->speed >= HCD_USB3 && (wLength < USB_DT_SS_HUB_SIZE || wValue != (USB_DT_SS_HUB << 8))) { dev_err(hcd_dev(hcd), "Wrong hub descriptor type for USB 3.0 roothub.\n"); goto error; } if (hcd->speed >= HCD_USB3) ss_hub_descriptor((struct usb_hub_descriptor *) buf); else hub_descriptor((struct usb_hub_descriptor *) buf); break; case DeviceRequest | USB_REQ_GET_DESCRIPTOR: if (hcd->speed < HCD_USB3) goto error; if ((wValue >> 8) != USB_DT_BOS) goto error; memcpy(buf, &usb3_bos_desc, sizeof(usb3_bos_desc)); retval = sizeof(usb3_bos_desc); break; case GetHubStatus: usbip_dbg_vhci_rh(" GetHubStatus\n"); *(__le32 *) buf = cpu_to_le32(0); break; case GetPortStatus: usbip_dbg_vhci_rh(" GetPortStatus port %x\n", wIndex); if (invalid_rhport) { dev_err(hcd_dev(hcd), "invalid port number %d\n", wIndex); retval = -EPIPE; goto error; } /* we do not care about resume. */ /* whoever resets or resumes must GetPortStatus to * complete it!! */ if (vhci_hcd->resuming && time_after(jiffies, vhci_hcd->re_timeout)) { vhci_hcd->port_status[rhport] |= (1 << USB_PORT_FEAT_C_SUSPEND); vhci_hcd->port_status[rhport] &= ~(1 << USB_PORT_FEAT_SUSPEND); vhci_hcd->resuming = 0; vhci_hcd->re_timeout = 0; } if ((vhci_hcd->port_status[rhport] & (1 << USB_PORT_FEAT_RESET)) != 0 && time_after(jiffies, vhci_hcd->re_timeout)) { vhci_hcd->port_status[rhport] |= (1 << USB_PORT_FEAT_C_RESET); vhci_hcd->port_status[rhport] &= ~(1 << USB_PORT_FEAT_RESET); vhci_hcd->re_timeout = 0; /* * A few drivers do usb reset during probe when * the device could be in VDEV_ST_USED state */ if (vhci_hcd->vdev[rhport].ud.status == VDEV_ST_NOTASSIGNED || vhci_hcd->vdev[rhport].ud.status == VDEV_ST_USED) { usbip_dbg_vhci_rh( " enable rhport %d (status %u)\n", rhport, vhci_hcd->vdev[rhport].ud.status); vhci_hcd->port_status[rhport] |= USB_PORT_STAT_ENABLE; } if (hcd->speed < HCD_USB3) { switch (vhci_hcd->vdev[rhport].speed) { case USB_SPEED_HIGH: vhci_hcd->port_status[rhport] |= USB_PORT_STAT_HIGH_SPEED; break; case USB_SPEED_LOW: vhci_hcd->port_status[rhport] |= USB_PORT_STAT_LOW_SPEED; break; default: dev_err(hcd_dev(hcd), "vhci_device speed not set\n"); break; } } } ((__le16 *) buf)[0] = cpu_to_le16(vhci_hcd->port_status[rhport]); ((__le16 *) buf)[1] = cpu_to_le16(vhci_hcd->port_status[rhport] >> 16); usbip_dbg_vhci_rh(" GetPortStatus bye %x %x\n", ((u16 *)buf)[0], ((u16 *)buf)[1]); break; case SetHubFeature: usbip_dbg_vhci_rh(" SetHubFeature\n"); retval = -EPIPE; break; case SetPortFeature: switch (wValue) { case USB_PORT_FEAT_LINK_STATE: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_LINK_STATE\n"); if (hcd->speed < HCD_USB3) { dev_err(hcd_dev(hcd), "USB_PORT_FEAT_LINK_STATE req not supported for USB 2.0 roothub\n"); goto error; } /* * Since this is dummy we don't have an actual link so * there is nothing to do for the SET_LINK_STATE cmd */ break; case USB_PORT_FEAT_U1_TIMEOUT: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_U1_TIMEOUT\n"); fallthrough; case USB_PORT_FEAT_U2_TIMEOUT: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_U2_TIMEOUT\n"); /* TODO: add suspend/resume support! */ if (hcd->speed < HCD_USB3) { dev_err(hcd_dev(hcd), "USB_PORT_FEAT_U1/2_TIMEOUT req not supported for USB 2.0 roothub\n"); goto error; } break; case USB_PORT_FEAT_SUSPEND: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_SUSPEND\n"); /* Applicable only for USB2.0 hub */ if (hcd->speed >= HCD_USB3) { dev_err(hcd_dev(hcd), "USB_PORT_FEAT_SUSPEND req not supported for USB 3.0 roothub\n"); goto error; } if (invalid_rhport) { dev_err(hcd_dev(hcd), "invalid port number %d\n", wIndex); goto error; } vhci_hcd->port_status[rhport] |= USB_PORT_STAT_SUSPEND; break; case USB_PORT_FEAT_POWER: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_POWER\n"); if (invalid_rhport) { dev_err(hcd_dev(hcd), "invalid port number %d\n", wIndex); goto error; } if (hcd->speed >= HCD_USB3) vhci_hcd->port_status[rhport] |= USB_SS_PORT_STAT_POWER; else vhci_hcd->port_status[rhport] |= USB_PORT_STAT_POWER; break; case USB_PORT_FEAT_BH_PORT_RESET: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_BH_PORT_RESET\n"); if (invalid_rhport) { dev_err(hcd_dev(hcd), "invalid port number %d\n", wIndex); goto error; } /* Applicable only for USB3.0 hub */ if (hcd->speed < HCD_USB3) { dev_err(hcd_dev(hcd), "USB_PORT_FEAT_BH_PORT_RESET req not supported for USB 2.0 roothub\n"); goto error; } fallthrough; case USB_PORT_FEAT_RESET: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_RESET\n"); if (invalid_rhport) { dev_err(hcd_dev(hcd), "invalid port number %d\n", wIndex); goto error; } /* if it's already enabled, disable */ if (hcd->speed >= HCD_USB3) { vhci_hcd->port_status[rhport] = 0; vhci_hcd->port_status[rhport] = (USB_SS_PORT_STAT_POWER | USB_PORT_STAT_CONNECTION | USB_PORT_STAT_RESET); } else if (vhci_hcd->port_status[rhport] & USB_PORT_STAT_ENABLE) { vhci_hcd->port_status[rhport] &= ~(USB_PORT_STAT_ENABLE | USB_PORT_STAT_LOW_SPEED | USB_PORT_STAT_HIGH_SPEED); } /* 50msec reset signaling */ vhci_hcd->re_timeout = jiffies + msecs_to_jiffies(50); fallthrough; default: usbip_dbg_vhci_rh(" SetPortFeature: default %d\n", wValue); if (invalid_rhport) { dev_err(hcd_dev(hcd), "invalid port number %d\n", wIndex); goto error; } if (wValue >= 32) goto error; if (hcd->speed >= HCD_USB3) { if ((vhci_hcd->port_status[rhport] & USB_SS_PORT_STAT_POWER) != 0) { vhci_hcd->port_status[rhport] |= (1 << wValue); } } else if ((vhci_hcd->port_status[rhport] & USB_PORT_STAT_POWER) != 0) { vhci_hcd->port_status[rhport] |= (1 << wValue); } } break; case GetPortErrorCount: usbip_dbg_vhci_rh(" GetPortErrorCount\n"); if (hcd->speed < HCD_USB3) { dev_err(hcd_dev(hcd), "GetPortErrorCount req not supported for USB 2.0 roothub\n"); goto error; } /* We'll always return 0 since this is a dummy hub */ *(__le32 *) buf = cpu_to_le32(0); break; case SetHubDepth: usbip_dbg_vhci_rh(" SetHubDepth\n"); if (hcd->speed < HCD_USB3) { dev_err(hcd_dev(hcd), "SetHubDepth req not supported for USB 2.0 roothub\n"); goto error; } break; default: dev_err(hcd_dev(hcd), "default hub control req: %04x v%04x i%04x l%d\n", typeReq, wValue, wIndex, wLength); error: /* "protocol stall" on error */ retval = -EPIPE; } if (usbip_dbg_flag_vhci_rh) { dev_dbg(hcd_dev(hcd), "%s port %d\n", __func__, rhport); /* Only dump valid port status */ if (!invalid_rhport) { dump_port_status_diff(prev_port_status[rhport], vhci_hcd->port_status[rhport], hcd->speed >= HCD_USB3); } } usbip_dbg_vhci_rh(" bye\n"); spin_unlock_irqrestore(&vhci->lock, flags); if (!invalid_rhport && (vhci_hcd->port_status[rhport] & PORT_C_MASK) != 0) { usb_hcd_poll_rh_status(hcd); } return retval; } static void vhci_tx_urb(struct urb *urb, struct vhci_device *vdev) { struct vhci_priv *priv; struct vhci_hcd *vhci_hcd = vdev_to_vhci_hcd(vdev); unsigned long flags; priv = kzalloc(sizeof(struct vhci_priv), GFP_ATOMIC); if (!priv) { usbip_event_add(&vdev->ud, VDEV_EVENT_ERROR_MALLOC); return; } spin_lock_irqsave(&vdev->priv_lock, flags); priv->seqnum = (u32)atomic_inc_return(&vhci_hcd->seqnum); if (priv->seqnum == 0xffff) dev_info(&urb->dev->dev, "seqnum max\n"); priv->vdev = vdev; priv->urb = urb; urb->hcpriv = (void *) priv; list_add_tail(&priv->list, &vdev->priv_tx); wake_up(&vdev->waitq_tx); spin_unlock_irqrestore(&vdev->priv_lock, flags); } static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags) { struct vhci_hcd *vhci_hcd = hcd_to_vhci_hcd(hcd); struct vhci *vhci = vhci_hcd->vhci; struct device *dev = &urb->dev->dev; u8 portnum = urb->dev->portnum; int ret = 0; struct vhci_device *vdev; unsigned long flags; if (portnum > VHCI_HC_PORTS) { dev_err(hcd_dev(hcd), "invalid port number %d\n", portnum); return -ENODEV; } vdev = &vhci_hcd->vdev[portnum-1]; if (!urb->transfer_buffer && !urb->num_sgs && urb->transfer_buffer_length) { dev_dbg(dev, "Null URB transfer buffer\n"); return -EINVAL; } spin_lock_irqsave(&vhci->lock, flags); if (urb->status != -EINPROGRESS) { dev_err(dev, "URB already unlinked!, status %d\n", urb->status); spin_unlock_irqrestore(&vhci->lock, flags); return urb->status; } /* refuse enqueue for dead connection */ spin_lock(&vdev->ud.lock); if (vdev->ud.status == VDEV_ST_NULL || vdev->ud.status == VDEV_ST_ERROR) { dev_err(dev, "enqueue for inactive port %d\n", vdev->rhport); spin_unlock(&vdev->ud.lock); spin_unlock_irqrestore(&vhci->lock, flags); return -ENODEV; } spin_unlock(&vdev->ud.lock); ret = usb_hcd_link_urb_to_ep(hcd, urb); if (ret) goto no_need_unlink; /* * The enumeration process is as follows; * * 1. Get_Descriptor request to DevAddrs(0) EndPoint(0) * to get max packet length of default pipe * * 2. Set_Address request to DevAddr(0) EndPoint(0) * */ if (usb_pipedevice(urb->pipe) == 0) { struct usb_device *old; __u8 type = usb_pipetype(urb->pipe); struct usb_ctrlrequest *ctrlreq = (struct usb_ctrlrequest *) urb->setup_packet; if (type != PIPE_CONTROL || !ctrlreq) { dev_err(dev, "invalid request to devnum 0\n"); ret = -EINVAL; goto no_need_xmit; } old = vdev->udev; switch (ctrlreq->bRequest) { case USB_REQ_SET_ADDRESS: /* set_address may come when a device is reset */ dev_info(dev, "SetAddress Request (%d) to port %d\n", ctrlreq->wValue, vdev->rhport); vdev->udev = usb_get_dev(urb->dev); /* * NOTE: A similar operation has been done via * USB_REQ_GET_DESCRIPTOR handler below, which is * supposed to always precede USB_REQ_SET_ADDRESS. * * It's not entirely clear if operating on a different * usb_device instance here is a real possibility, * otherwise this call and vdev->udev assignment above * should be dropped. */ dev_pm_syscore_device(&vdev->udev->dev, true); usb_put_dev(old); spin_lock(&vdev->ud.lock); vdev->ud.status = VDEV_ST_USED; spin_unlock(&vdev->ud.lock); if (urb->status == -EINPROGRESS) { /* This request is successfully completed. */ /* If not -EINPROGRESS, possibly unlinked. */ urb->status = 0; } goto no_need_xmit; case USB_REQ_GET_DESCRIPTOR: if (ctrlreq->wValue == cpu_to_le16(USB_DT_DEVICE << 8)) usbip_dbg_vhci_hc( "Not yet?:Get_Descriptor to device 0 (get max pipe size)\n"); vdev->udev = usb_get_dev(urb->dev); /* * Set syscore PM flag for the virtually attached * devices to ensure they will not enter suspend on * the client side. * * Note this doesn't have any impact on the physical * devices attached to the host system on the server * side, hence there is no need to undo the operation * on disconnect. */ dev_pm_syscore_device(&vdev->udev->dev, true); usb_put_dev(old); goto out; default: /* NOT REACHED */ dev_err(dev, "invalid request to devnum 0 bRequest %u, wValue %u\n", ctrlreq->bRequest, ctrlreq->wValue); ret = -EINVAL; goto no_need_xmit; } } out: vhci_tx_urb(urb, vdev); spin_unlock_irqrestore(&vhci->lock, flags); return 0; no_need_xmit: usb_hcd_unlink_urb_from_ep(hcd, urb); no_need_unlink: if (!ret) { /* usb_hcd_giveback_urb() should be called with * irqs disabled */ spin_unlock(&vhci->lock); usb_hcd_giveback_urb(hcd, urb, urb->status); spin_lock(&vhci->lock); } spin_unlock_irqrestore(&vhci->lock, flags); return ret; } /* * vhci_rx gives back the urb after receiving the reply of the urb. If an * unlink pdu is sent or not, vhci_rx receives a normal return pdu and gives * back its urb. For the driver unlinking the urb, the content of the urb is * not important, but the calling to its completion handler is important; the * completion of unlinking is notified by the completion handler. * * * CLIENT SIDE * * - When vhci_hcd receives RET_SUBMIT, * * - case 1a). the urb of the pdu is not unlinking. * - normal case * => just give back the urb * * - case 1b). the urb of the pdu is unlinking. * - usbip.ko will return a reply of the unlinking request. * => give back the urb now and go to case 2b). * * - When vhci_hcd receives RET_UNLINK, * * - case 2a). a submit request is still pending in vhci_hcd. * - urb was really pending in usbip.ko and urb_unlink_urb() was * completed there. * => free a pending submit request * => notify unlink completeness by giving back the urb * * - case 2b). a submit request is *not* pending in vhci_hcd. * - urb was already given back to the core driver. * => do not give back the urb * * * SERVER SIDE * * - When usbip receives CMD_UNLINK, * * - case 3a). the urb of the unlink request is now in submission. * => do usb_unlink_urb(). * => after the unlink is completed, send RET_UNLINK. * * - case 3b). the urb of the unlink request is not in submission. * - may be already completed or never be received * => send RET_UNLINK * */ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) { struct vhci_hcd *vhci_hcd = hcd_to_vhci_hcd(hcd); struct vhci *vhci = vhci_hcd->vhci; struct vhci_priv *priv; struct vhci_device *vdev; unsigned long flags; spin_lock_irqsave(&vhci->lock, flags); priv = urb->hcpriv; if (!priv) { /* URB was never linked! or will be soon given back by * vhci_rx. */ spin_unlock_irqrestore(&vhci->lock, flags); return -EIDRM; } { int ret = 0; ret = usb_hcd_check_unlink_urb(hcd, urb, status); if (ret) { spin_unlock_irqrestore(&vhci->lock, flags); return ret; } } /* send unlink request here? */ vdev = priv->vdev; if (!vdev->ud.tcp_socket) { /* tcp connection is closed */ spin_lock(&vdev->priv_lock); list_del(&priv->list); kfree(priv); urb->hcpriv = NULL; spin_unlock(&vdev->priv_lock); /* * If tcp connection is alive, we have sent CMD_UNLINK. * vhci_rx will receive RET_UNLINK and give back the URB. * Otherwise, we give back it here. */ usb_hcd_unlink_urb_from_ep(hcd, urb); spin_unlock_irqrestore(&vhci->lock, flags); usb_hcd_giveback_urb(hcd, urb, urb->status); spin_lock_irqsave(&vhci->lock, flags); } else { /* tcp connection is alive */ struct vhci_unlink *unlink; spin_lock(&vdev->priv_lock); /* setup CMD_UNLINK pdu */ unlink = kzalloc(sizeof(struct vhci_unlink), GFP_ATOMIC); if (!unlink) { spin_unlock(&vdev->priv_lock); spin_unlock_irqrestore(&vhci->lock, flags); usbip_event_add(&vdev->ud, VDEV_EVENT_ERROR_MALLOC); return -ENOMEM; } unlink->seqnum = atomic_inc_return(&vhci_hcd->seqnum); if (unlink->seqnum == 0xffff) dev_info(hcd_dev(hcd), "seqnum max\n"); unlink->unlink_seqnum = priv->seqnum; /* send cmd_unlink and try to cancel the pending URB in the * peer */ list_add_tail(&unlink->list, &vdev->unlink_tx); wake_up(&vdev->waitq_tx); spin_unlock(&vdev->priv_lock); } spin_unlock_irqrestore(&vhci->lock, flags); usbip_dbg_vhci_hc("leave\n"); return 0; } static void vhci_cleanup_unlink_list(struct vhci_device *vdev, struct list_head *unlink_list) { struct vhci_hcd *vhci_hcd = vdev_to_vhci_hcd(vdev); struct usb_hcd *hcd = vhci_hcd_to_hcd(vhci_hcd); struct vhci *vhci = vhci_hcd->vhci; struct vhci_unlink *unlink, *tmp; unsigned long flags; spin_lock_irqsave(&vhci->lock, flags); spin_lock(&vdev->priv_lock); list_for_each_entry_safe(unlink, tmp, unlink_list, list) { struct urb *urb; urb = pickup_urb_and_free_priv(vdev, unlink->unlink_seqnum); if (!urb) { list_del(&unlink->list); kfree(unlink); continue; } urb->status = -ENODEV; usb_hcd_unlink_urb_from_ep(hcd, urb); list_del(&unlink->list); spin_unlock(&vdev->priv_lock); spin_unlock_irqrestore(&vhci->lock, flags); usb_hcd_giveback_urb(hcd, urb, urb->status); spin_lock_irqsave(&vhci->lock, flags); spin_lock(&vdev->priv_lock); kfree(unlink); } spin_unlock(&vdev->priv_lock); spin_unlock_irqrestore(&vhci->lock, flags); } static void vhci_device_unlink_cleanup(struct vhci_device *vdev) { /* give back URB of unsent unlink request */ vhci_cleanup_unlink_list(vdev, &vdev->unlink_tx); /* give back URB of unanswered unlink request */ vhci_cleanup_unlink_list(vdev, &vdev->unlink_rx); } /* * The important thing is that only one context begins cleanup. * This is why error handling and cleanup become simple. * We do not want to consider race condition as possible. */ static void vhci_shutdown_connection(struct usbip_device *ud) { struct vhci_device *vdev = container_of(ud, struct vhci_device, ud); struct usb_hcd *hcd = vhci_hcd_to_hcd(vdev_to_vhci_hcd(vdev)); /* need this? see stub_dev.c */ if (ud->tcp_socket) { dev_dbg(hcd_dev(hcd), "shutdown tcp_socket %d\n", ud->sockfd); kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR); } /* kill threads related to this sdev */ if (vdev->ud.tcp_rx) { kthread_stop_put(vdev->ud.tcp_rx); vdev->ud.tcp_rx = NULL; } if (vdev->ud.tcp_tx) { kthread_stop_put(vdev->ud.tcp_tx); vdev->ud.tcp_tx = NULL; } dev_info(hcd_dev(hcd), "stop threads\n"); /* active connection is closed */ if (vdev->ud.tcp_socket) { sockfd_put(vdev->ud.tcp_socket); vdev->ud.tcp_socket = NULL; vdev->ud.sockfd = -1; } dev_info(hcd_dev(hcd), "release socket\n"); vhci_device_unlink_cleanup(vdev); /* * rh_port_disconnect() is a trigger of ... * usb_disable_device(): * disable all the endpoints for a USB device. * usb_disable_endpoint(): * disable endpoints. pending urbs are unlinked(dequeued). * * NOTE: After calling rh_port_disconnect(), the USB device drivers of a * detached device should release used urbs in a cleanup function (i.e. * xxx_disconnect()). Therefore, vhci_hcd does not need to release * pushed urbs and their private data in this function. * * NOTE: vhci_dequeue() must be considered carefully. When shutting down * a connection, vhci_shutdown_connection() expects vhci_dequeue() * gives back pushed urbs and frees their private data by request of * the cleanup function of a USB driver. When unlinking a urb with an * active connection, vhci_dequeue() does not give back the urb which * is actually given back by vhci_rx after receiving its return pdu. * */ rh_port_disconnect(vdev); dev_info(hcd_dev(hcd), "disconnect device\n"); } static void vhci_device_reset(struct usbip_device *ud) { struct vhci_device *vdev = container_of(ud, struct vhci_device, ud); struct usb_device *old = vdev->udev; unsigned long flags; spin_lock_irqsave(&ud->lock, flags); vdev->speed = 0; vdev->devid = 0; vdev->udev = NULL; usb_put_dev(old); if (ud->tcp_socket) { sockfd_put(ud->tcp_socket); ud->tcp_socket = NULL; ud->sockfd = -1; } ud->status = VDEV_ST_NULL; spin_unlock_irqrestore(&ud->lock, flags); } static void vhci_device_unusable(struct usbip_device *ud) { unsigned long flags; spin_lock_irqsave(&ud->lock, flags); ud->status = VDEV_ST_ERROR; spin_unlock_irqrestore(&ud->lock, flags); } static void vhci_device_init(struct vhci_device *vdev) { memset(vdev, 0, sizeof(struct vhci_device)); vdev->ud.side = USBIP_VHCI; vdev->ud.status = VDEV_ST_NULL; spin_lock_init(&vdev->ud.lock); mutex_init(&vdev->ud.sysfs_lock); INIT_LIST_HEAD(&vdev->priv_rx); INIT_LIST_HEAD(&vdev->priv_tx); INIT_LIST_HEAD(&vdev->unlink_tx); INIT_LIST_HEAD(&vdev->unlink_rx); spin_lock_init(&vdev->priv_lock); init_waitqueue_head(&vdev->waitq_tx); vdev->ud.eh_ops.shutdown = vhci_shutdown_connection; vdev->ud.eh_ops.reset = vhci_device_reset; vdev->ud.eh_ops.unusable = vhci_device_unusable; usbip_start_eh(&vdev->ud); } static int hcd_name_to_id(const char *name) { char *c; long val; int ret; c = strchr(name, '.'); if (c == NULL) return 0; ret = kstrtol(c+1, 10, &val); if (ret < 0) return ret; return val; } static int vhci_setup(struct usb_hcd *hcd) { struct vhci *vhci = *((void **)dev_get_platdata(hcd->self.controller)); if (usb_hcd_is_primary_hcd(hcd)) { vhci->vhci_hcd_hs = hcd_to_vhci_hcd(hcd); vhci->vhci_hcd_hs->vhci = vhci; /* * Mark the first roothub as being USB 2.0. * The USB 3.0 roothub will be registered later by * vhci_hcd_probe() */ hcd->speed = HCD_USB2; hcd->self.root_hub->speed = USB_SPEED_HIGH; } else { vhci->vhci_hcd_ss = hcd_to_vhci_hcd(hcd); vhci->vhci_hcd_ss->vhci = vhci; hcd->speed = HCD_USB31; hcd->self.root_hub->speed = USB_SPEED_SUPER_PLUS; } /* accept arbitrarily long scatter-gather lists */ hcd->self.sg_tablesize = ~0; hcd->self.no_sg_constraint = 1; return 0; } static int vhci_start(struct usb_hcd *hcd) { struct vhci_hcd *vhci_hcd = hcd_to_vhci_hcd(hcd); int id, rhport; int err; usbip_dbg_vhci_hc("enter vhci_start\n"); if (usb_hcd_is_primary_hcd(hcd)) spin_lock_init(&vhci_hcd->vhci->lock); /* initialize private data of usb_hcd */ for (rhport = 0; rhport < VHCI_HC_PORTS; rhport++) { struct vhci_device *vdev = &vhci_hcd->vdev[rhport]; vhci_device_init(vdev); vdev->rhport = rhport; } atomic_set(&vhci_hcd->seqnum, 0); hcd->power_budget = 0; /* no limit */ hcd->uses_new_polling = 1; #ifdef CONFIG_USB_OTG hcd->self.otg_port = 1; #endif id = hcd_name_to_id(hcd_name(hcd)); if (id < 0) { dev_err(hcd_dev(hcd), "invalid vhci name %s\n", hcd_name(hcd)); return -EINVAL; } /* vhci_hcd is now ready to be controlled through sysfs */ if (id == 0 && usb_hcd_is_primary_hcd(hcd)) { err = vhci_init_attr_group(); if (err) { dev_err(hcd_dev(hcd), "init attr group failed, err = %d\n", err); return err; } err = sysfs_create_group(&hcd_dev(hcd)->kobj, &vhci_attr_group); if (err) { dev_err(hcd_dev(hcd), "create sysfs files failed, err = %d\n", err); vhci_finish_attr_group(); return err; } dev_info(hcd_dev(hcd), "created sysfs %s\n", hcd_name(hcd)); } return 0; } static void vhci_stop(struct usb_hcd *hcd) { struct vhci_hcd *vhci_hcd = hcd_to_vhci_hcd(hcd); int id, rhport; usbip_dbg_vhci_hc("stop VHCI controller\n"); /* 1. remove the userland interface of vhci_hcd */ id = hcd_name_to_id(hcd_name(hcd)); if (id == 0 && usb_hcd_is_primary_hcd(hcd)) { sysfs_remove_group(&hcd_dev(hcd)->kobj, &vhci_attr_group); vhci_finish_attr_group(); } /* 2. shutdown all the ports of vhci_hcd */ for (rhport = 0; rhport < VHCI_HC_PORTS; rhport++) { struct vhci_device *vdev = &vhci_hcd->vdev[rhport]; usbip_event_add(&vdev->ud, VDEV_EVENT_REMOVED); usbip_stop_eh(&vdev->ud); } } static int vhci_get_frame_number(struct usb_hcd *hcd) { dev_err_ratelimited(&hcd->self.root_hub->dev, "Not yet implemented\n"); return 0; } #ifdef CONFIG_PM /* FIXME: suspend/resume */ static int vhci_bus_suspend(struct usb_hcd *hcd) { struct vhci *vhci = *((void **)dev_get_platdata(hcd->self.controller)); unsigned long flags; dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__); spin_lock_irqsave(&vhci->lock, flags); hcd->state = HC_STATE_SUSPENDED; spin_unlock_irqrestore(&vhci->lock, flags); return 0; } static int vhci_bus_resume(struct usb_hcd *hcd) { struct vhci *vhci = *((void **)dev_get_platdata(hcd->self.controller)); int rc = 0; unsigned long flags; dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__); spin_lock_irqsave(&vhci->lock, flags); if (!HCD_HW_ACCESSIBLE(hcd)) rc = -ESHUTDOWN; else hcd->state = HC_STATE_RUNNING; spin_unlock_irqrestore(&vhci->lock, flags); return rc; } #else #define vhci_bus_suspend NULL #define vhci_bus_resume NULL #endif /* Change a group of bulk endpoints to support multiple stream IDs */ static int vhci_alloc_streams(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint **eps, unsigned int num_eps, unsigned int num_streams, gfp_t mem_flags) { dev_dbg(&hcd->self.root_hub->dev, "vhci_alloc_streams not implemented\n"); return 0; } /* Reverts a group of bulk endpoints back to not using stream IDs. */ static int vhci_free_streams(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint **eps, unsigned int num_eps, gfp_t mem_flags) { dev_dbg(&hcd->self.root_hub->dev, "vhci_free_streams not implemented\n"); return 0; } static const struct hc_driver vhci_hc_driver = { .description = driver_name, .product_desc = driver_desc, .hcd_priv_size = sizeof(struct vhci_hcd), .flags = HCD_USB31 | HCD_SHARED, .reset = vhci_setup, .start = vhci_start, .stop = vhci_stop, .urb_enqueue = vhci_urb_enqueue, .urb_dequeue = vhci_urb_dequeue, .get_frame_number = vhci_get_frame_number, .hub_status_data = vhci_hub_status, .hub_control = vhci_hub_control, .bus_suspend = vhci_bus_suspend, .bus_resume = vhci_bus_resume, .alloc_streams = vhci_alloc_streams, .free_streams = vhci_free_streams, }; static int vhci_hcd_probe(struct platform_device *pdev) { struct vhci *vhci = *((void **)dev_get_platdata(&pdev->dev)); struct usb_hcd *hcd_hs; struct usb_hcd *hcd_ss; int ret; usbip_dbg_vhci_hc("name %s id %d\n", pdev->name, pdev->id); /* * Allocate and initialize hcd. * Our private data is also allocated automatically. */ hcd_hs = usb_create_hcd(&vhci_hc_driver, &pdev->dev, dev_name(&pdev->dev)); if (!hcd_hs) return dev_err_probe(&pdev->dev, -ENOMEM, "create primary hcd failed\n"); hcd_hs->has_tt = 1; /* * Finish generic HCD structure initialization and register. * Call the driver's reset() and start() routines. */ ret = usb_add_hcd(hcd_hs, 0, 0); if (ret) { dev_err_probe(&pdev->dev, ret, "usb_add_hcd hs failed\n"); goto put_usb2_hcd; } hcd_ss = usb_create_shared_hcd(&vhci_hc_driver, &pdev->dev, dev_name(&pdev->dev), hcd_hs); if (!hcd_ss) { ret = dev_err_probe(&pdev->dev, -ENOMEM, "create shared hcd failed\n"); goto remove_usb2_hcd; } ret = usb_add_hcd(hcd_ss, 0, 0); if (ret) { dev_err_probe(&pdev->dev, ret, "usb_add_hcd ss failed\n"); goto put_usb3_hcd; } usbip_dbg_vhci_hc("bye\n"); return 0; put_usb3_hcd: usb_put_hcd(hcd_ss); remove_usb2_hcd: usb_remove_hcd(hcd_hs); put_usb2_hcd: usb_put_hcd(hcd_hs); vhci->vhci_hcd_hs = NULL; vhci->vhci_hcd_ss = NULL; return ret; } static void vhci_hcd_remove(struct platform_device *pdev) { struct vhci *vhci = *((void **)dev_get_platdata(&pdev->dev)); /* * Disconnects the root hub, * then reverses the effects of usb_add_hcd(), * invoking the HCD's stop() methods. */ usb_remove_hcd(vhci_hcd_to_hcd(vhci->vhci_hcd_ss)); usb_put_hcd(vhci_hcd_to_hcd(vhci->vhci_hcd_ss)); usb_remove_hcd(vhci_hcd_to_hcd(vhci->vhci_hcd_hs)); usb_put_hcd(vhci_hcd_to_hcd(vhci->vhci_hcd_hs)); vhci->vhci_hcd_hs = NULL; vhci->vhci_hcd_ss = NULL; } #ifdef CONFIG_PM /* what should happen for USB/IP under suspend/resume? */ static int vhci_hcd_suspend(struct platform_device *pdev, pm_message_t state) { struct usb_hcd *hcd; struct vhci *vhci; int rhport; int connected = 0; int ret = 0; unsigned long flags; dev_dbg(&pdev->dev, "%s\n", __func__); hcd = platform_get_drvdata(pdev); if (!hcd) return 0; vhci = *((void **)dev_get_platdata(hcd->self.controller)); spin_lock_irqsave(&vhci->lock, flags); for (rhport = 0; rhport < VHCI_HC_PORTS; rhport++) { if (vhci->vhci_hcd_hs->port_status[rhport] & USB_PORT_STAT_CONNECTION) connected += 1; if (vhci->vhci_hcd_ss->port_status[rhport] & USB_PORT_STAT_CONNECTION) connected += 1; } spin_unlock_irqrestore(&vhci->lock, flags); if (connected > 0) { dev_info(&pdev->dev, "We have %d active connection%s. Do not suspend.\n", connected, str_plural(connected)); ret = -EBUSY; } else { dev_info(&pdev->dev, "suspend vhci_hcd"); clear_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); } return ret; } static int vhci_hcd_resume(struct platform_device *pdev) { struct usb_hcd *hcd; dev_dbg(&pdev->dev, "%s\n", __func__); hcd = platform_get_drvdata(pdev); if (!hcd) return 0; set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); usb_hcd_poll_rh_status(hcd); return 0; } #else #define vhci_hcd_suspend NULL #define vhci_hcd_resume NULL #endif static struct platform_driver vhci_driver = { .probe = vhci_hcd_probe, .remove = vhci_hcd_remove, .suspend = vhci_hcd_suspend, .resume = vhci_hcd_resume, .driver = { .name = driver_name, }, }; static void del_platform_devices(void) { int i; for (i = 0; i < vhci_num_controllers; i++) { platform_device_unregister(vhcis[i].pdev); vhcis[i].pdev = NULL; } sysfs_remove_link(&platform_bus.kobj, driver_name); } static int __init vhci_hcd_init(void) { int i, ret; if (usb_disabled()) return -ENODEV; if (vhci_num_controllers < 1) vhci_num_controllers = 1; vhcis = kcalloc(vhci_num_controllers, sizeof(struct vhci), GFP_KERNEL); if (vhcis == NULL) return -ENOMEM; ret = platform_driver_register(&vhci_driver); if (ret) goto err_driver_register; for (i = 0; i < vhci_num_controllers; i++) { void *vhci = &vhcis[i]; struct platform_device_info pdevinfo = { .name = driver_name, .id = i, .data = &vhci, .size_data = sizeof(void *), }; vhcis[i].pdev = platform_device_register_full(&pdevinfo); ret = PTR_ERR_OR_ZERO(vhcis[i].pdev); if (ret < 0) { while (i--) platform_device_unregister(vhcis[i].pdev); goto err_add_hcd; } } return 0; err_add_hcd: platform_driver_unregister(&vhci_driver); err_driver_register: kfree(vhcis); return ret; } static void __exit vhci_hcd_exit(void) { del_platform_devices(); platform_driver_unregister(&vhci_driver); kfree(vhcis); } module_init(vhci_hcd_init); module_exit(vhci_hcd_exit); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
3 3 3 3 1 3 2 1 1 2 2 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include <linux/phy.h> #include <linux/phylib_stubs.h> struct linkstate_req_info { struct ethnl_req_info base; }; struct linkstate_reply_data { struct ethnl_reply_data base; int link; int sqi; int sqi_max; struct ethtool_link_ext_stats link_stats; bool link_ext_state_provided; struct ethtool_link_ext_state_info ethtool_link_ext_state_info; }; #define LINKSTATE_REPDATA(__reply_base) \ container_of(__reply_base, struct linkstate_reply_data, base) const struct nla_policy ethnl_linkstate_get_policy[] = { [ETHTOOL_A_LINKSTATE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), }; static int linkstate_get_sqi(struct phy_device *phydev) { int ret; if (!phydev) return -EOPNOTSUPP; mutex_lock(&phydev->lock); if (!phydev->drv || !phydev->drv->get_sqi) ret = -EOPNOTSUPP; else if (!phydev->link) ret = -ENETDOWN; else ret = phydev->drv->get_sqi(phydev); mutex_unlock(&phydev->lock); return ret; } static int linkstate_get_sqi_max(struct phy_device *phydev) { int ret; if (!phydev) return -EOPNOTSUPP; mutex_lock(&phydev->lock); if (!phydev->drv || !phydev->drv->get_sqi_max) ret = -EOPNOTSUPP; else if (!phydev->link) ret = -ENETDOWN; else ret = phydev->drv->get_sqi_max(phydev); mutex_unlock(&phydev->lock); return ret; }; static bool linkstate_sqi_critical_error(int sqi) { return sqi < 0 && sqi != -EOPNOTSUPP && sqi != -ENETDOWN; } static bool linkstate_sqi_valid(struct linkstate_reply_data *data) { return data->sqi >= 0 && data->sqi_max >= 0 && data->sqi <= data->sqi_max; } static int linkstate_get_link_ext_state(struct net_device *dev, struct linkstate_reply_data *data) { int err; if (!dev->ethtool_ops->get_link_ext_state) return -EOPNOTSUPP; err = dev->ethtool_ops->get_link_ext_state(dev, &data->ethtool_link_ext_state_info); if (err) return err; data->link_ext_state_provided = true; return 0; } static int linkstate_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; struct nlattr **tb = info->attrs; struct phy_device *phydev; int ret; phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_LINKSTATE_HEADER, info->extack); if (IS_ERR(phydev)) { ret = PTR_ERR(phydev); goto out; } ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->link = __ethtool_get_link(dev); ret = linkstate_get_sqi(phydev); if (linkstate_sqi_critical_error(ret)) goto out; data->sqi = ret; ret = linkstate_get_sqi_max(phydev); if (linkstate_sqi_critical_error(ret)) goto out; data->sqi_max = ret; if (dev->flags & IFF_UP) { ret = linkstate_get_link_ext_state(dev, data); if (ret < 0 && ret != -EOPNOTSUPP && ret != -ENODATA) goto out; } ethtool_stats_init((u64 *)&data->link_stats, sizeof(data->link_stats) / 8); if (req_base->flags & ETHTOOL_FLAG_STATS) { if (phydev) phy_ethtool_get_link_ext_stats(phydev, &data->link_stats); if (dev->ethtool_ops->get_link_ext_stats) dev->ethtool_ops->get_link_ext_stats(dev, &data->link_stats); } ret = 0; out: ethnl_ops_complete(dev); return ret; } static int linkstate_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); int len; len = nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + 0; if (linkstate_sqi_valid(data)) { len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI */ len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI_MAX */ } if (data->link_ext_state_provided) len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_STATE */ if (data->ethtool_link_ext_state_info.__link_ext_substate) len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_SUBSTATE */ if (data->link_stats.link_down_events != ETHTOOL_STAT_NOT_SET) len += nla_total_size(sizeof(u32)); return len; } static int linkstate_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); if (data->link >= 0 && nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link)) return -EMSGSIZE; if (linkstate_sqi_valid(data)) { if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) return -EMSGSIZE; } if (data->link_ext_state_provided) { if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE, data->ethtool_link_ext_state_info.link_ext_state)) return -EMSGSIZE; if (data->ethtool_link_ext_state_info.__link_ext_substate && nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, data->ethtool_link_ext_state_info.__link_ext_substate)) return -EMSGSIZE; } if (data->link_stats.link_down_events != ETHTOOL_STAT_NOT_SET) if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, data->link_stats.link_down_events)) return -EMSGSIZE; return 0; } const struct ethnl_request_ops ethnl_linkstate_request_ops = { .request_cmd = ETHTOOL_MSG_LINKSTATE_GET, .reply_cmd = ETHTOOL_MSG_LINKSTATE_GET_REPLY, .hdr_attr = ETHTOOL_A_LINKSTATE_HEADER, .req_info_size = sizeof(struct linkstate_req_info), .reply_data_size = sizeof(struct linkstate_reply_data), .prepare_data = linkstate_prepare_data, .reply_size = linkstate_reply_size, .fill_reply = linkstate_fill_reply, };
35 35 35 35 13 5 21 35 11 1 1 1 49 14 35 1 1 1 1 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 /* * Copyright (c) 2006-2008 Intel Corporation * Copyright (c) 2007 Dave Airlie <airlied@linux.ie> * Copyright (c) 2008 Red Hat Inc. * Copyright (c) 2016 Intel Corporation * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #include <drm/drm_device.h> #include <drm/drm_drv.h> #include <drm/drm_dumb_buffers.h> #include <drm/drm_fourcc.h> #include <drm/drm_gem.h> #include <drm/drm_mode.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" #include "drm_internal.h" /** * DOC: overview * * The KMS API doesn't standardize backing storage object creation and leaves it * to driver-specific ioctls. Furthermore actually creating a buffer object even * for GEM-based drivers is done through a driver-specific ioctl - GEM only has * a common userspace interface for sharing and destroying objects. While not an * issue for full-fledged graphics stacks that include device-specific userspace * components (in libdrm for instance), this limit makes DRM-based early boot * graphics unnecessarily complex. * * Dumb objects partly alleviate the problem by providing a standard API to * create dumb buffers suitable for scanout, which can then be used to create * KMS frame buffers. * * To support dumb objects drivers must implement the &drm_driver.dumb_create * and &drm_driver.dumb_map_offset operations (the latter defaults to * drm_gem_dumb_map_offset() if not set). Drivers that don't use GEM handles * additionally need to implement the &drm_driver.dumb_destroy operation. See * the callbacks for further details. * * Note that dumb objects may not be used for gpu acceleration, as has been * attempted on some ARM embedded platforms. Such drivers really must have * a hardware-specific ioctl to allocate suitable buffer objects. */ static int drm_mode_align_dumb(struct drm_mode_create_dumb *args, unsigned long hw_pitch_align, unsigned long hw_size_align) { u32 pitch = args->pitch; u32 size; if (!pitch) return -EINVAL; if (hw_pitch_align) pitch = roundup(pitch, hw_pitch_align); if (!hw_size_align) hw_size_align = PAGE_SIZE; else if (!IS_ALIGNED(hw_size_align, PAGE_SIZE)) return -EINVAL; /* TODO: handle this if necessary */ if (check_mul_overflow(args->height, pitch, &size)) return -EINVAL; size = ALIGN(size, hw_size_align); if (!size) return -EINVAL; args->pitch = pitch; args->size = size; return 0; } /** * drm_mode_size_dumb - Calculates the scanline and buffer sizes for dumb buffers * @dev: DRM device * @args: Parameters for the dumb buffer * @hw_pitch_align: Hardware scanline alignment in bytes * @hw_size_align: Hardware buffer-size alignment in bytes * * The helper drm_mode_size_dumb() calculates the size of the buffer * allocation and the scanline size for a dumb buffer. Callers have to * set the buffers width, height and color mode in the argument @arg. * The helper validates the correctness of the input and tests for * possible overflows. If successful, it returns the dumb buffer's * required scanline pitch and size in &args. * * The parameter @hw_pitch_align allows the driver to specifies an * alignment for the scanline pitch, if the hardware requires any. The * calculated pitch will be a multiple of the alignment. The parameter * @hw_size_align allows to specify an alignment for buffer sizes. The * provided alignment should represent requirements of the graphics * hardware. drm_mode_size_dumb() handles GEM-related constraints * automatically across all drivers and hardware. For example, the * returned buffer size is always a multiple of PAGE_SIZE, which is * required by mmap(). * * Returns: * Zero on success, or a negative error code otherwise. */ int drm_mode_size_dumb(struct drm_device *dev, struct drm_mode_create_dumb *args, unsigned long hw_pitch_align, unsigned long hw_size_align) { u64 pitch = 0; u32 fourcc; /* * The scanline pitch depends on the buffer width and the color * format. The latter is specified as a color-mode constant for * which we first have to find the corresponding color format. * * Different color formats can have the same color-mode constant. * For example XRGB8888 and BGRX8888 both have a color mode of 32. * It is possible to use different formats for dumb-buffer allocation * and rendering as long as all involved formats share the same * color-mode constant. */ fourcc = drm_driver_color_mode_format(dev, args->bpp); if (fourcc != DRM_FORMAT_INVALID) { const struct drm_format_info *info = drm_format_info(fourcc); if (!info) return -EINVAL; pitch = drm_format_info_min_pitch(info, 0, args->width); } else if (args->bpp) { /* * Some userspace throws in arbitrary values for bpp and * relies on the kernel to figure it out. In this case we * fall back to the old method of using bpp directly. The * over-commitment of memory from the rounding is acceptable * for compatibility with legacy userspace. We have a number * of deprecated legacy values that are explicitly supported. */ switch (args->bpp) { default: drm_warn_once(dev, "Unknown color mode %u; guessing buffer size.\n", args->bpp); fallthrough; /* * These constants represent various YUV formats supported by * drm_gem_afbc_get_bpp(). */ case 12: // DRM_FORMAT_YUV420_8BIT case 15: // DRM_FORMAT_YUV420_10BIT case 30: // DRM_FORMAT_VUY101010 fallthrough; /* * Used by Mesa and Gstreamer to allocate NV formats and others * as RGB buffers. Technically, XRGB16161616F formats are RGB, * but the dumb buffers are not supposed to be used for anything * beyond 32 bits per pixels. */ case 10: // DRM_FORMAT_NV{15,20,30}, DRM_FORMAT_P010 case 64: // DRM_FORMAT_{XRGB,XBGR,ARGB,ABGR}16161616F pitch = args->width * DIV_ROUND_UP(args->bpp, SZ_8); break; } } if (!pitch || pitch > U32_MAX) return -EINVAL; args->pitch = pitch; return drm_mode_align_dumb(args, hw_pitch_align, hw_size_align); } EXPORT_SYMBOL(drm_mode_size_dumb); int drm_mode_create_dumb(struct drm_device *dev, struct drm_mode_create_dumb *args, struct drm_file *file_priv) { u32 cpp, stride, size; if (!dev->driver->dumb_create) return -ENOSYS; if (!args->width || !args->height || !args->bpp) return -EINVAL; /* overflow checks for 32bit size calculations */ if (args->bpp > U32_MAX - 8) return -EINVAL; cpp = DIV_ROUND_UP(args->bpp, 8); if (cpp > U32_MAX / args->width) return -EINVAL; stride = cpp * args->width; if (args->height > U32_MAX / stride) return -EINVAL; /* test for wrap-around */ size = args->height * stride; if (PAGE_ALIGN(size) == 0) return -EINVAL; /* * handle, pitch and size are output parameters. Zero them out to * prevent drivers from accidentally using uninitialized data. Since * not all existing userspace is clearing these fields properly we * cannot reject IOCTL with garbage in them. */ args->handle = 0; args->pitch = 0; args->size = 0; return dev->driver->dumb_create(file_priv, dev, args); } int drm_mode_create_dumb_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_create_dumb *args = data; int err; err = drm_mode_create_dumb(dev, args, file_priv); if (err) { args->handle = 0; args->pitch = 0; args->size = 0; } return err; } static int drm_mode_mmap_dumb(struct drm_device *dev, struct drm_mode_map_dumb *args, struct drm_file *file_priv) { if (!dev->driver->dumb_create) return -ENOSYS; if (dev->driver->dumb_map_offset) return dev->driver->dumb_map_offset(file_priv, dev, args->handle, &args->offset); else return drm_gem_dumb_map_offset(file_priv, dev, args->handle, &args->offset); } /** * drm_mode_mmap_dumb_ioctl - create an mmap offset for a dumb backing storage buffer * @dev: DRM device * @data: ioctl data * @file_priv: DRM file info * * Allocate an offset in the drm device node's address space to be able to * memory map a dumb buffer. * * Called by the user via ioctl. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_mmap_dumb_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_map_dumb *args = data; int err; err = drm_mode_mmap_dumb(dev, args, file_priv); if (err) args->offset = 0; return err; } int drm_mode_destroy_dumb(struct drm_device *dev, u32 handle, struct drm_file *file_priv) { if (!dev->driver->dumb_create) return -ENOSYS; return drm_gem_handle_delete(file_priv, handle); } int drm_mode_destroy_dumb_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_destroy_dumb *args = data; return drm_mode_destroy_dumb(dev, args->handle, file_priv); }
216 216 121 82 7 75 6 136 101 100 167 94 124 57 56 13 155 76 45 232 138 157 217 205 4 2 2 5 7 357 318 321 102 293 3 13 32 73 167 158 78 77 73 13 5 256 227 80 2 225 89 21 27 27 37 22 402 114 48 40 45 165 161 63 3 69 112 71 211 182 115 48 112 2 21 75 53 47 66 51 81 130 130 83 53 60 48 48 202 67 212 212 35 163 62 50 78 130 212 125 142 159 218 235 13 20 141 213 21 211 118 14 355 2 40 1 284 4 44 8 15 36 4 84 84 1 4 82 82 1 82 1 23 79 4 32 3 42 78 27 20 52 31 16 10 74 74 7 17 8 17 18 71 71 10 7 5 15 11 20 5 71 65 56 24 71 71 9 71 45 45 42 30 5 45 10 43 8 44 8 44 44 49 6 1 6 74 20 4 4 53 15 22 46 46 14 19 4 13 6 2 10 20 40 7 7 74 74 49 74 5 74 45 74 73 1 71 32 71 71 70 33 70 70 70 47 1 10 86 86 72 68 86 3 32 84 84 3 13 75 16 75 84 19 28 28 66 72 75 75 75 71 36 72 12 71 48 48 47 19 70 68 70 84 84 83 75 84 84 84 86 86 86 86 5 5 3 3 3 3 3 5 5 5 4 509 66 14 21 62 7 2 5 1 1 7 5 2 7 7 7 2 1 5 3 3 3 7 7 7 7 11 78 321 73 62 7 81 67 62 62 2 62 62 60 2 62 94 9 9 174 55 51 55 13 55 55 16 40 10 45 50 116 10 110 80 70 22 70 71 71 70 3 71 7 2 2 2 1 2 67 67 67 66 66 7 1 12 11 3 2 2 1 6 8 6 49 29 3 85 8 85 23 93 94 93 11 94 65 79 8 3 3 51 3 2 54 82 72 3 49 76 76 76 12 6 8 12 12 86 3 3 6 89 90 70 3 67 357 326 387 12 218 69 355 357 357 191 422 293 334 168 334 122 313 7 31 384 6 5 358 346 270 31 22 71 56 323 75 5 343 343 347 71 226 202 31 348 347 63 347 87 338 292 270 239 98 58 278 387 70 347 5 342 231 71 56 341 386 385 385 271 291 290 289 50 26 3 82 37 56 324 191 358 352 201 68 394 297 391 393 330 393 281 212 174 89 91 177 392 77 2 36 25 73 44 67 63 79 18 18 17 18 86 86 180 335 40 24 8 8 12 12 12 16 382 13 381 78 386 396 1 1 276 353 393 136 362 54 359 241 393 387 86 393 391 71 393 391 1 387 386 387 385 38 387 87 49 231 345 229 357 386 387 385 77 77 76 1 77 74 8 3 8 4 60 3 2 5 510 512 512 48 285 7 15 11 124 204 1 126 204 3 227 211 125 212 6 2 87 64 10 7 10 95 7 7 301 23 223 279 24 205 303 296 6 4 3 17 3 30 3 8 4 2 2 28 361 32 7 8 4 132 131 131 103 28 99 6 27 129 81 22 58 1 2 79 10 47 23 25 54 10 18 49 9 138 7 92 3 25 119 49 45 5 5 5 22 22 7 24 11 4 17 11 22 56 7 54 48 22 23 12 3 24 37 46 18 18 8 176 30 173 144 128 124 29 31 196 197 35 54 48 63 7 68 68 62 39 54 15 291 291 50 30 84 14 4 64 30 30 56 19 19 13 14 15 56 19 4 56 35 4 79 83 83 296 265 151 296 119 277 277 30 1 21 9 4 2 22 7 271 271 353 354 192 300 300 253 251 3 21 65 193 191 68 220 71 48 55 242 54 254 230 112 10 243 117 117 119 4 16 16 85 47 16 42 565 552 47 29 47 29 41 18 25 22 42 16 42 42 38 42 22 42 41 36 42 15 49 14 16 16 15 16 15 16 37 27 11 11 11 11 3 49 49 2 47 30 11 37 6 119 125 110 125 426 406 125 110 77 180 376 286 270 2 133 283 231 148 4 54 74 77 32 241 269 82 82 82 74 3 2 4 377 347 81 347 57 2 61 5 375 374 22 338 4 1 1 1 1 373 2 23 27 49 5 25 316 122 4 119 12 6 288 289 200 105 257 36 109 107 2 45 46 46 98 33 63 23 71 70 28 75 262 260 258 1 253 253 253 1 261 261 261 261 261 142 59 83 139 2 78 64 37 37 37 7 31 7 31 79 3 202 217 93 3 3 174 98 82 1 83 83 82 82 46 37 82 7 13 1 14 63 2 124 8 2 6 117 123 506 508 6 17 1 3 129 296 69 191 190 21 180 1 4 117 3 116 119 119 2 117 119 115 6 74 73 3 2 1 1 58 1 10 170 2 2 55 17 8 1 167 168 104 105 82 25 2 11 11 122 121 120 121 121 16 16 16 96 2 1 1 2 123 123 16 1 114 12 122 121 27 96 9 12 98 105 98 12 3 9 105 98 98 12 98 27 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 // SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ /* * Changes: * Pedro Roque : Fast Retransmit/Recovery. * Two receive queues. * Retransmit queue handled by TCP. * Better retransmit timer handling. * New congestion avoidance. * Header prediction. * Variable renaming. * * Eric : Fast Retransmit. * Randy Scott : MSS option defines. * Eric Schenk : Fixes to slow start algorithm. * Eric Schenk : Yet another double ACK bug. * Eric Schenk : Delayed ACK bug fixes. * Eric Schenk : Floyd style fast retrans war avoidance. * David S. Miller : Don't allow zero congestion window. * Eric Schenk : Fix retransmitter so that it sends * next packet on ack of previous packet. * Andi Kleen : Moved open_request checking here * and process RSTs for open_requests. * Andi Kleen : Better prune_queue, and other fixes. * Andrey Savochkin: Fix RTT measurements in the presence of * timestamps. * Andrey Savochkin: Check sequence numbers correctly when * removing SACKs due to in sequence incoming * data segments. * Andi Kleen: Make sure we never ack data there is not * enough room for. Also make this condition * a fatal error if it might still happen. * Andi Kleen: Add tcp_measure_rcv_mss to make * connections with MSS<min(MTU,ann. MSS) * work without delayed acks. * Andi Kleen: Process packets with PSH set in the * fast path. * J Hadi Salim: ECN support * Andrei Gurtov, * Pasi Sarolahti, * Panu Kuhlberg: Experimental audit of TCP (re)transmission * engine. Lots of bugs are found. * Pasi Sarolahti: F-RTO for dealing with spurious RTOs */ #define pr_fmt(fmt) "TCP: " fmt #include <linux/mm.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/sysctl.h> #include <linux/kernel.h> #include <linux/prefetch.h> #include <linux/bitops.h> #include <net/dst.h> #include <net/tcp.h> #include <net/tcp_ecn.h> #include <net/proto_memory.h> #include <net/inet_common.h> #include <linux/ipsec.h> #include <linux/unaligned.h> #include <linux/errqueue.h> #include <trace/events/tcp.h> #include <linux/jump_label_ratelimit.h> #include <net/busy_poll.h> #include <net/mptcp.h> int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ #define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ #define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ #define FLAG_DSACK_TLP 0x20000 /* DSACK for tail loss probe */ #define FLAG_TS_PROGRESS 0x40000 /* Positive timestamp delta */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) #define REXMIT_NONE 0 /* no loss recovery to do */ #define REXMIT_LOST 1 /* retransmit packets marked lost */ #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ #if IS_ENABLED(CONFIG_TLS_DEVICE) static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ); void clean_acked_data_enable(struct tcp_sock *tp, void (*cad)(struct sock *sk, u32 ack_seq)) { tp->tcp_clean_acked = cad; static_branch_deferred_inc(&clean_acked_data_enabled); } EXPORT_SYMBOL_GPL(clean_acked_data_enable); void clean_acked_data_disable(struct tcp_sock *tp) { static_branch_slow_dec_deferred(&clean_acked_data_enabled); tp->tcp_clean_acked = NULL; } EXPORT_SYMBOL_GPL(clean_acked_data_disable); void clean_acked_data_flush(void) { static_key_deferred_flush(&clean_acked_data_enabled); } EXPORT_SYMBOL_GPL(clean_acked_data_flush); #endif #ifdef CONFIG_CGROUP_BPF static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) { bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown && BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG); bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG); struct bpf_sock_ops_kern sock_ops; if (likely(!unknown_opt && !parse_all_opt)) return; /* The skb will be handled in the * bpf_skops_established() or * bpf_skops_write_hdr_opt(). */ switch (sk->sk_state) { case TCP_SYN_RECV: case TCP_SYN_SENT: case TCP_LISTEN: return; } sock_owned_by_me(sk); memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; sock_ops.is_fullsock = 1; sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); } static void bpf_skops_established(struct sock *sk, int bpf_op, struct sk_buff *skb) { struct bpf_sock_ops_kern sock_ops; sock_owned_by_me(sk); memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = bpf_op; sock_ops.is_fullsock = 1; sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ if (skb) bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); } #else static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) { } static void bpf_skops_established(struct sock *sk, int bpf_op, struct sk_buff *skb) { } #endif static __cold void tcp_gro_dev_warn(const struct sock *sk, const struct sk_buff *skb, unsigned int len) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); if (!dev || len >= READ_ONCE(dev->mtu)) pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n", dev ? dev->name : "Unknown driver"); rcu_read_unlock(); } /* Adapt the MSS value used to make delayed ack decision to the * real world. */ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); const unsigned int lss = icsk->icsk_ack.last_seg_size; unsigned int len; icsk->icsk_ack.last_seg_size = 0; /* skb->len may jitter because of SACKs, even if peer * sends good full-sized frames. */ len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { /* Note: divides are still a bit expensive. * For the moment, only adjust scaling_ratio * when we update icsk_ack.rcv_mss. */ if (unlikely(len != icsk->icsk_ack.rcv_mss)) { u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE; u8 old_ratio = tcp_sk(sk)->scaling_ratio; do_div(val, skb->truesize); tcp_sk(sk)->scaling_ratio = val ? val : 1; if (old_ratio != tcp_sk(sk)->scaling_ratio) { struct tcp_sock *tp = tcp_sk(sk); val = tcp_win_from_space(sk, sk->sk_rcvbuf); tcp_set_window_clamp(sk, val); if (tp->window_clamp < tp->rcvq_space.space) tp->rcvq_space.space = tp->window_clamp; } } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); /* Account for possibly-removed options */ DO_ONCE_LITE_IF(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE, tcp_gro_dev_warn, sk, skb, len); /* If the skb has a len of exactly 1*MSS and has the PSH bit * set then it is likely the end of an application write. So * more data may not be arriving soon, and yet the data sender * may be waiting for an ACK if cwnd-bound or using TX zero * copy. So we set ICSK_ACK_PUSHED here so that * tcp_cleanup_rbuf() will send an ACK immediately if the app * reads all of the data and is not ping-pong. If len > MSS * then this logic does not matter (and does not hurt) because * tcp_cleanup_rbuf() will always ACK immediately if the app * reads data and there is more than an MSS of unACKed data. */ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH) icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. * * "len" is invariant segment length, including TCP header. */ len += skb->data - skb_transport_header(skb); if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) || /* If PSH is not set, packet should be * full sized, provided peer TCP is not badly broken. * This observation (if it is correct 8)) allows * to handle super-low mtu links fairly. */ (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) { /* Subtract also invariant (if peer is RFC compliant), * tcp header plus fixed timestamp option length. * Resulting "len" is MSS free of SACK jitter. */ len -= tcp_sk(sk)->tcp_header_len; icsk->icsk_ack.last_seg_size = len; if (len == lss) { icsk->icsk_ack.rcv_mss = len; return; } } if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2; icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } } static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); if (quickacks == 0) quickacks = 2; quickacks = min(quickacks, max_quickacks); if (quickacks > icsk->icsk_ack.quick) icsk->icsk_ack.quick = quickacks; } static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_incr_quickack(sk, max_quickacks); inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. */ static bool tcp_in_quickack_mode(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); return icsk->icsk_ack.dst_quick_ack || (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_ecn_disabled(tp)) return; switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { case INET_ECN_NOT_ECT: /* Funny extension: if ECT is not set on a segment, * and we already seen ECT on a previous segment, * it is probably a retransmit. */ if (tp->ecn_flags & TCP_ECN_SEEN) tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && tcp_ecn_mode_rfc3168(tp)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by * tcp_data_ecn_check() when the ECN codepoint of * received TCP data contains ECT(0), ECT(1), or CE. */ if (!tcp_ecn_mode_rfc3168(tp)) break; tp->ecn_flags |= TCP_ECN_SEEN; break; default: if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); if (!tcp_ecn_mode_rfc3168(tp)) break; tp->ecn_flags |= TCP_ECN_SEEN; break; } } /* Returns true if the byte counters can be used */ static bool tcp_accecn_process_option(struct tcp_sock *tp, const struct sk_buff *skb, u32 delivered_bytes, int flag) { u8 estimate_ecnfield = tp->est_ecnfield; bool ambiguous_ecn_bytes_incr = false; bool first_changed = false; unsigned int optlen; bool order1, res; unsigned int i; u8 *ptr; if (tcp_accecn_opt_fail_recv(tp)) return false; if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { if (!tp->saw_accecn_opt) { /* Too late to enable after this point due to * potential counter wraps */ if (tp->bytes_sent >= (1 << 23) - 1) { u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN; tcp_accecn_saw_opt_fail_recv(tp, saw_opt); } return false; } if (estimate_ecnfield) { u8 ecnfield = estimate_ecnfield - 1; tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; return true; } return false; } ptr = skb_transport_header(skb) + tp->rx_opt.accecn; optlen = ptr[1] - 2; if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) return false; order1 = (ptr[0] == TCPOPT_ACCECN1); ptr += 2; if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { tp->saw_accecn_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); } res = !!estimate_ecnfield; for (i = 0; i < 3; i++) { u32 init_offset; u8 ecnfield; s32 delta; u32 *cnt; if (optlen < TCPOLEN_ACCECN_PERFIELD) break; ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); init_offset = tcp_accecn_field_init_offset(ecnfield); cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); if (delta && delta < 0) { res = false; ambiguous_ecn_bytes_incr = true; } if (delta && ecnfield != estimate_ecnfield) { if (!first_changed) { tp->est_ecnfield = ecnfield; first_changed = true; } else { res = false; ambiguous_ecn_bytes_incr = true; } } optlen -= TCPOLEN_ACCECN_PERFIELD; ptr += TCPOLEN_ACCECN_PERFIELD; } if (ambiguous_ecn_bytes_incr) tp->est_ecnfield = 0; return res; } static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { tp->delivered_ce += ecn_count; } /* Updates the delivered and delivered_ce counts */ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, bool ece_ack) { tp->delivered += delivered; if (tcp_ecn_mode_rfc3168(tp) && ece_ack) tcp_count_delivered_ce(tp, delivered); } /* Returns the ECN CE delta */ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, u32 delivered_pkts, u32 delivered_bytes, int flag) { u32 old_ceb = tcp_sk(sk)->delivered_ecn_bytes[INET_ECN_CE - 1]; const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); u32 delta, safe_delta, d_ceb; bool opt_deltas_valid; u32 corrected_ace; /* Reordered ACK or uncertain due to lack of data to send and ts */ if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) return 0; opt_deltas_valid = tcp_accecn_process_option(tp, skb, delivered_bytes, flag); if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) return 0; } /* ACE field is not available during handshake */ if (flag & FLAG_SYN_ACKED) return 0; if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA) inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET; delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK; if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) return delta; safe_delta = delivered_pkts - ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK); if (opt_deltas_valid) { d_ceb = tp->delivered_ecn_bytes[INET_ECN_CE - 1] - old_ceb; if (!d_ceb) return delta; if ((delivered_pkts >= (TCP_ACCECN_CEP_ACE_MASK + 1) * 2) && (tcp_is_sack(tp) || ((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))) { u32 est_d_cep; if (delivered_bytes <= d_ceb) return safe_delta; est_d_cep = DIV_ROUND_UP_ULL((u64)d_ceb * delivered_pkts, delivered_bytes); return min(safe_delta, delta + (est_d_cep & ~TCP_ACCECN_CEP_ACE_MASK)); } if (d_ceb > delta * tp->mss_cache) return safe_delta; if (d_ceb < safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT) return delta; } return safe_delta; } static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, u32 delivered_pkts, u32 delivered_bytes, int *flag) { struct tcp_sock *tp = tcp_sk(sk); u32 delta; delta = __tcp_accecn_process(sk, skb, delivered_pkts, delivered_bytes, *flag); if (delta > 0) { tcp_count_delivered_ce(tp, delta); *flag |= FLAG_ECE; /* Recalculate header predictor */ if (tp->pred_flags) tcp_fast_path_on(tp); } return delta; } /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. */ static void tcp_sndbuf_expand(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; int sndmem, per_mss; u32 nr_segs; /* Worst case is non GSO/TSO : each frame consumes one skb * and skb->head is kmalloced using power of two area of memory */ per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); per_mss = roundup_pow_of_two(per_mss) + SKB_DATA_ALIGN(sizeof(struct sk_buff)); nr_segs = max_t(u32, TCP_INIT_CWND, tcp_snd_cwnd(tp)); nr_segs = max_t(u32, nr_segs, tp->reordering + 1); /* Fast Recovery (RFC 5681 3.2) : * Cubic needs 1.7 factor, rounded to 2 to include * extra cushion (application might react slowly to EPOLLOUT) */ sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2; sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) WRITE_ONCE(sk->sk_sndbuf, min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2]))); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) * * All tcp_full_space() is split to two parts: "network" buffer, allocated * forward and advertised in receiver window (tp->rcv_wnd) and * "application buffer", required to isolate scheduling/application * latencies from network. * window_clamp is maximal advertised window. It can be less than * tcp_full_space(), in this case tcp_full_space() - window_clamp * is reserved for "application" buffer. The less window_clamp is * the smoother our behaviour from viewpoint of network, but the lower * throughput and the higher sensitivity of the connection to losses. 8) * * rcv_ssthresh is more strict window_clamp used at "slow start" * phase to predict further behaviour of this connection. * It is used for two goals: * - to enforce header prediction at sender, even when application * requires some significant "application buffer". It is check #1. * - to prevent pruning of receive queue because of misprediction * of receiver window. Check #2. * * The scheme does not work when sender sends good segments opening * window and then starts to feed us spaghetti. But it should work * in common situations. Otherwise, we have to rely on queue collapsing. */ /* Slow part of check#2. */ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, unsigned int skbtruesize) { const struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ int truesize = tcp_win_from_space(sk, skbtruesize) >> 1; int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) return 2 * inet_csk(sk)->icsk_ack.rcv_mss; truesize >>= 1; window >>= 1; } return 0; } /* Even if skb appears to have a bad len/truesize ratio, TCP coalescing * can play nice with us, as sk_buff and skb->head might be either * freed or shared with up to MAX_SKB_FRAGS segments. * Only give a boost to drivers using page frag(s) to hold the frame(s), * and if no payload was pulled in skb->head before reaching us. */ static u32 truesize_adjust(bool adjust, const struct sk_buff *skb) { u32 truesize = skb->truesize; if (adjust && !skb_headlen(skb)) { truesize -= SKB_TRUESIZE(skb_end_offset(skb)); /* paranoid check, some drivers might be buggy */ if (unlikely((int)truesize < (int)skb->len)) truesize = skb->truesize; } return truesize; } static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, bool adjust) { struct tcp_sock *tp = tcp_sk(sk); int room; room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh; if (room <= 0) return; /* Check #1 */ if (!tcp_under_memory_pressure(sk)) { unsigned int truesize = truesize_adjust(adjust, skb); int incr; /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ if (tcp_win_from_space(sk, truesize) <= skb->len) incr = 2 * tp->advmss; else incr = __tcp_grow_window(sk, skb, truesize); if (incr) { incr = max_t(int, incr, 2 * skb->len); tp->rcv_ssthresh += min(room, incr); inet_csk(sk)->icsk_ack.quick |= 1; } } else { /* Under pressure: * Adjust rcv_ssthresh according to reserved mem */ tcp_adjust_rcv_ssthresh(sk); } } /* 3. Try to fixup all. It is made immediately after connection enters * established state. */ static void tcp_init_buffer_space(struct sock *sk) { int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win); struct tcp_sock *tp = tcp_sk(sk); int maxwin; if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) tcp_sndbuf_expand(sk); tcp_mstamp_refresh(tp); tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; maxwin = tcp_full_space(sk); if (tp->window_clamp >= maxwin) { WRITE_ONCE(tp->window_clamp, maxwin); if (tcp_app_win && maxwin > 4 * tp->advmss) WRITE_ONCE(tp->window_clamp, max(maxwin - (maxwin >> tcp_app_win), 4 * tp->advmss)); } /* Force reservation of one segment. */ if (tcp_app_win && tp->window_clamp > 2 * tp->advmss && tp->window_clamp + tp->advmss > maxwin) WRITE_ONCE(tp->window_clamp, max(2 * tp->advmss, maxwin - tp->advmss)); tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_jiffies32; tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd, (u32)TCP_INIT_CWND * tp->advmss); } /* 4. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); int rmem2; icsk->icsk_ack.quick = 0; rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); if (sk->sk_rcvbuf < rmem2 && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { WRITE_ONCE(sk->sk_rcvbuf, min(atomic_read(&sk->sk_rmem_alloc), rmem2)); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); } /* Initialize RCV_MSS value. * RCV_MSS is an our guess about MSS used by the peer. * We haven't any direct information about the MSS. * It's better to underestimate the RCV_MSS rather than overestimate. * Overestimations make us ACKing less frequently than needed. * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). */ void tcp_initialize_rcv_mss(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); hint = min(hint, tp->rcv_wnd / 2); hint = min(hint, TCP_MSS_DEFAULT); hint = max(hint, TCP_MIN_MSS); inet_csk(sk)->icsk_ack.rcv_mss = hint; } EXPORT_IPV6_MOD(tcp_initialize_rcv_mss); /* Receiver "autotuning" code. * * The algorithm for RTT estimation w/o timestamps is based on * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. * <https://public.lanl.gov/radiant/pubs.html#DRS> * * More detail on this code can be found at * <http://staff.psc.edu/jheffner/>, * though this reference is out of date. A new paper * is pending. */ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) { u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us; long m = sample << 3; if (old_sample == 0 || m < old_sample) { new_sample = m; } else { /* If we sample in larger samples in the non-timestamp * case, we could grossly overestimate the RTT especially * with chatty applications or bulk transfer apps which * are stalled on filesystem I/O. * * Also, since we are only going for a minimum in the * non-timestamp case, we do not smooth things out * else with timestamps disabled convergence takes too * long. */ if (win_dep) return; /* Do not use this sample if receive queue is not empty. */ if (tp->rcv_nxt != tp->copied_seq) return; new_sample = old_sample - (old_sample >> 3) + sample; } tp->rcv_rtt_est.rtt_us = new_sample; } static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) { u32 delta_us; if (tp->rcv_rtt_est.time == 0) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time); if (!delta_us) delta_us = 1; tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; tp->rcv_rtt_est.time = tp->tcp_mstamp; } static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta) { u32 delta, delta_us; delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr; if (tp->tcp_usec_ts) return delta; if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { if (!delta) delta = min_delta; delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); return delta_us; } return -1; } static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr) return; tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; if (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { s32 delta = tcp_rtt_tsopt_us(tp, 0); if (delta > 0) tcp_rcv_rtt_update(tp, delta, 0); } } void tcp_rcvbuf_grow(struct sock *sk, u32 newval) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); u32 rcvwin, rcvbuf, cap, oldval; u32 rtt_threshold, rtt_us; u64 grow; oldval = tp->rcvq_space.space; tp->rcvq_space.space = newval; if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) return; /* DRS is always one RTT late. */ rcvwin = newval << 1; rtt_us = tp->rcv_rtt_est.rtt_us >> 3; rtt_threshold = READ_ONCE(net->ipv4.sysctl_tcp_rcvbuf_low_rtt); if (rtt_us < rtt_threshold) { /* For small RTT, we set @grow to rcvwin * rtt_us/rtt_threshold. * It might take few additional ms to reach 'line rate', * but will avoid sk_rcvbuf inflation and poor cache use. */ grow = div_u64((u64)rcvwin * rtt_us, rtt_threshold); } else { /* slow start: allow the sender to double its rate. */ grow = div_u64(((u64)rcvwin << 1) * (newval - oldval), oldval); } rcvwin += grow; if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt; cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap); if (rcvbuf > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make the window clamp follow along. */ WRITE_ONCE(tp->window_clamp, tcp_win_from_space(sk, rcvbuf)); } } /* * This function should be called every time data is copied to user space. * It calculates the appropriate TCP receive buffer space. */ void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int time, inq, copied; trace_tcp_rcv_space_adjust(sk); if (unlikely(!tp->rcv_rtt_est.rtt_us)) return; /* We do not refresh tp->tcp_mstamp here. * Some platforms have expensive ktime_get() implementations. * Using the last cached value is enough for DRS. */ time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); if (time < (tp->rcv_rtt_est.rtt_us >> 3)) return; /* Number of bytes copied to user in last RTT */ copied = tp->copied_seq - tp->rcvq_space.seq; /* Number of bytes in receive queue. */ inq = tp->rcv_nxt - tp->copied_seq; copied -= inq; if (copied <= tp->rcvq_space.space) goto new_measure; trace_tcp_rcvbuf_grow(sk, time); tcp_rcvbuf_grow(sk, copied); new_measure: tp->rcvq_space.seq = tp->copied_seq; tp->rcvq_space.time = tp->tcp_mstamp; } static void tcp_save_lrcv_flowlabel(struct sock *sk, const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) struct inet_connection_sock *icsk = inet_csk(sk); if (skb->protocol == htons(ETH_P_IPV6)) icsk->icsk_ack.lrcv_flowlabel = ntohl(ip6_flowlabel(ipv6_hdr(skb))); #endif } /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a * connection starts up, we want to ack as quickly as possible. The * problem is that "good" TCP's do slow start at the beginning of data * transmission. The means that until we send the first few ACK's the * sender will sit on his end and only queue most of his data, because * he can only send snd_cwnd unacked packets at any given time. For * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); u32 now; inet_csk_schedule_ack(sk); tcp_measure_rcv_mss(sk, skb); tcp_rcv_rtt_measure(tp); now = tcp_jiffies32; if (!icsk->icsk_ack.ato) { /* The _first_ data packet received, initialize * delayed ACK engine. */ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); icsk->icsk_ack.ato = TCP_ATO_MIN; } else { int m = now - icsk->icsk_ack.lrcvtime; if (m <= TCP_ATO_MIN / 2) { /* The fastest case is the first. */ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; } else if (m < icsk->icsk_ack.ato) { icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m; if (icsk->icsk_ack.ato > icsk->icsk_rto) icsk->icsk_ack.ato = icsk->icsk_rto; } else if (m > icsk->icsk_rto) { /* Too long gap. Apparently sender failed to * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); } } icsk->icsk_ack.lrcvtime = now; tcp_save_lrcv_flowlabel(sk, skb); tcp_data_ecn_check(sk, skb); if (skb->len >= 128) tcp_grow_window(sk, skb, true); } /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) { struct tcp_sock *tp = tcp_sk(sk); long m = mrtt_us; /* RTT */ u32 srtt = tp->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ if (srtt != 0) { m -= (srtt >> 3); /* m is now error in rtt est */ srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (tp->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain * for mdev in this case (alpha*beta). * Like Eifel it also prevents growth of rto, * but also it limits too fast rto decreases, * happening in pure Eifel. */ if (m > 0) m >>= 3; } else { m -= (tp->mdev_us >> 2); /* similar update on mdev */ } tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ if (tp->mdev_us > tp->mdev_max_us) { tp->mdev_max_us = tp->mdev_us; if (tp->mdev_max_us > tp->rttvar_us) tp->rttvar_us = tp->mdev_max_us; } if (after(tp->snd_una, tp->rtt_seq)) { if (tp->mdev_max_us < tp->rttvar_us) tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; tp->rtt_seq = tp->snd_nxt; tp->mdev_max_us = tcp_rto_min_us(sk); tcp_bpf_rtt(sk, mrtt_us, srtt); } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ tp->mdev_us = m << 1; /* make sure rto = 3*rtt */ tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; tcp_bpf_rtt(sk, mrtt_us, srtt); } tp->srtt_us = max(1U, srtt); } void tcp_update_pacing_rate(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u64 rate; /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); /* current rate is (cwnd * mss) / srtt * In Slow Start [1], set sk_pacing_rate to 200 % the current rate. * In Congestion Avoidance phase, set it to 120 % the current rate. * * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching * end of slow start and should slow down. */ if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2) rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio); else rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio); rate *= max(tcp_snd_cwnd(tp), tp->packets_out); if (likely(tp->srtt_us)) do_div(rate, tp->srtt_us); /* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate * without any lock. We want to make sure compiler wont store * intermediate values in this location. */ WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate))); } /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) * * More seriously: * 1. If rtt variance happened to be less 50msec, it is hallucination. * It cannot be less due to utterly erratic ACK generation made * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ inet_csk(sk)->icsk_rto = __tcp_set_rto(tp); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with correct one. It is exactly, which we pretend to do. */ /* NOTE: clamping at TCP_RTO_MIN is not required, current algo * guarantees that rto is higher. */ tcp_bound_rto(sk); } __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) cwnd = TCP_INIT_CWND; return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } struct tcp_sacktag_state { /* Timestamps for earliest and latest never-retransmitted segment * that was SACKed. RTO needs the earliest RTT to stay conservative, * but congestion control should still get an accurate delay signal. */ u64 first_sackt; u64 last_sackt; u32 reord; u32 sack_delivered; u32 delivered_bytes; int flag; unsigned int mss_now; struct rate_sample *rate; }; /* Take a notice that peer is sending D-SACKs. Skip update of data delivery * and spurious retransmission information if this DSACK is unlikely caused by * sender's action: * - DSACKed sequence range is larger than maximum receiver's window. * - Total no. of DSACKed segments exceed the total no. of retransmitted segs. */ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, u32 end_seq, struct tcp_sacktag_state *state) { u32 seq_len, dup_segs = 1; if (!before(start_seq, end_seq)) return 0; seq_len = end_seq - start_seq; /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */ if (seq_len > tp->max_window) return 0; if (seq_len > tp->mss_cache) dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq) state->flag |= FLAG_DSACK_TLP; tp->dsack_dups += dup_segs; /* Skip the DSACK if dup segs weren't retransmitted by sender */ if (tp->dsack_dups > tp->total_retrans) return 0; tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; /* We increase the RACK ordering window in rounds where we receive * DSACKs that may have been due to reordering causing RACK to trigger * a spurious fast recovery. Thus RACK ignores DSACKs that happen * without having seen reordering, or that match TLP probes (TLP * is timer-driven, not triggered by RACK). */ if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP)) tp->rack.dsack_seen = 1; state->flag |= FLAG_DSACKING_ACK; /* A spurious retransmission is delivered */ state->sack_delivered += dup_segs; return dup_segs; } /* It's reordering when higher sequence was delivered (i.e. sacked) before * some lower never-retransmitted sequence ("low_seq"). The maximum reordering * distance is approximated in full-mss packet distance ("reordering"). */ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, const int ts) { struct tcp_sock *tp = tcp_sk(sk); const u32 mss = tp->mss_cache; u32 fack, metric; fack = tcp_highest_sack_seq(tp); if (!before(low_seq, fack)) return; metric = fack - low_seq; if ((metric > tp->reordering * mss) && mss) { #if FASTRETRANS_DEBUG > 1 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, tp->reordering, 0, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif tp->reordering = min_t(u32, (metric + mss - 1) / mss, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); } /* This exciting event is worth to be remembered. 8) */ tp->reord_seen++; NET_INC_STATS(sock_net(sk), ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } /* This must be called before lost_out or retrans_out are updated * on a new loss, because we want to know if all skbs previously * known to be lost have already been retransmitted, indicating * that this newly lost skb is our next skb to retransmit. */ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) || (tp->retransmit_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq))) tp->retransmit_skb_hint = skb; } /* Sum the number of packets on the wire we have marked as lost, and * notify the congestion control module that the given skb was marked lost. */ static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { tp->lost += tcp_skb_pcount(skb); } void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { __u8 sacked = TCP_SKB_CB(skb)->sacked; struct tcp_sock *tp = tcp_sk(sk); if (sacked & TCPCB_SACKED_ACKED) return; tcp_verify_retransmit_hint(tp, skb); if (sacked & TCPCB_LOST) { if (sacked & TCPCB_SACKED_RETRANS) { /* Account for retransmits that are lost again */ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT, tcp_skb_pcount(skb)); tcp_notify_skb_loss_event(tp, skb); } } else { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tcp_notify_skb_loss_event(tp, skb); } } /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). * Packets in queue with these bits set are counted in variables * sacked_out, retrans_out and lost_out, correspondingly. * * Valid combinations are: * Tag InFlight Description * 0 1 - orig segment is in flight. * S 0 - nothing flies, orig reached receiver. * L 0 - nothing flies, orig lost by net. * R 2 - both orig and retransmit are in flight. * L|R 1 - orig is lost, retransmit is in flight. * S|R 1 - orig reached receiver, retrans is still in flight. * (L|S|R is logically valid, it could occur when L|R is sacked, * but it is equivalent to plain S and code short-circuits it to S. * L|S is logically invalid, it would mean -1 packet in flight 8)) * * These 6 states form finite state machine, controlled by the following events: * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) * 3. Loss detection event of two flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. * B. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. * * It is pleasant to note, that state diagram turns out to be commutative, * so that we are allowed not to be bothered by order of our actions, * when multiple events arrive simultaneously. (see the function below). * * Reordering detection. * -------------------- * Reordering metric is maximal distance, which a packet can be displaced * in packet stream. With SACKs we can estimate it: * * 1. SACK fills old hole and the corresponding segment was not * ever retransmitted -> reordering. Alas, we cannot use it * when segment was retransmitted. * 2. The last flaw is solved with D-SACK. D-SACK arrives * for retransmitted and already SACKed segment -> reordering.. * Both of these heuristics are not used in Loss state, when we cannot * account for retransmits accurately. * * SACK block validation. * ---------------------- * * SACK block range validation checks that the received SACK block fits to * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. * Note that SND.UNA is not included to the range though being valid because * it means that the receiver is rather inconsistent with itself reporting * SACK reneging when it should advance SND.UNA. Such SACK block this is * perfectly valid, however, in light of RFC2018 which explicitly states * that "SACK block MUST reflect the newest segment. Even if the newest * segment is going to be discarded ...", not that it looks very clever * in case of head skb. Due to potentional receiver driven attacks, we * choose to avoid immediate execution of a walk in write queue due to * reneging and defer head skb's loss recovery to standard loss recovery * procedure that will eventually trigger (nothing forbids us doing this). * * Implements also blockage to start_seq wrap-around. Problem lies in the * fact that though start_seq (s) is before end_seq (i.e., not reversed), * there's no guarantee that it will be before snd_nxt (n). The problem * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt * wrap (s_w): * * <- outs wnd -> <- wrapzone -> * u e n u_w e_w s n_w * | | | | | | | * |<------------+------+----- TCP seqno space --------------+---------->| * ...-- <2^31 ->| |<--------... * ...---- >2^31 ------>| |<--------... * * Current code wouldn't be vulnerable but it's better still to discard such * crazy SACK blocks. Doing this check for start_seq alone closes somewhat * similar case (end_seq after snd_nxt wrap) as earlier reversed check in * snd_nxt wrap -> snd_una region will then become "well defined", i.e., * equal to the ideal case (infinite seqno space without wrap caused issues). * * With D-SACK the lower bound is extended to cover sequence space below * SND.UNA down to undo_marker, which is the last point of interest. Yet * again, D-SACK block must not to go across snd_una (for the same reason as * for the normal SACK blocks, explained above). But there all simplicity * ends, TCP might receive valid D-SACKs below that. As long as they reside * fully below undo_marker they do not affect behavior in anyway and can * therefore be safely ignored. In rare cases (which are more or less * theoretical ones), the D-SACK will nicely cross that boundary due to skb * fragmentation and packet reordering past skb's retransmission. To consider * them correctly, the acceptable range must be extended even more though * the exact amount is rather hard to quantify. However, tp->max_window can * be used as an exaggerated estimate. */ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, u32 start_seq, u32 end_seq) { /* Too far in future, or reversed (interpretation is ambiguous) */ if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) return false; /* Nasty start_seq wrap-around check (see comments above) */ if (!before(start_seq, tp->snd_nxt)) return false; /* In outstanding window? ...This is valid exit for D-SACKs too. * start_seq == snd_una is non-sensical (see comments above) */ if (after(start_seq, tp->snd_una)) return true; if (!is_dsack || !tp->undo_marker) return false; /* ...Then it's D-SACK, and must reside below snd_una completely */ if (after(end_seq, tp->snd_una)) return false; if (!before(start_seq, tp->undo_marker)) return true; /* Too old */ if (!after(end_seq, tp->undo_marker)) return false; /* Undo_marker boundary crossing (overestimates a lot). Known already: * start_seq < undo_marker and end_seq >= undo_marker. */ return !before(start_seq, end_seq - tp->max_window); } static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); u32 dup_segs; if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1)) return false; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); } else { return false; } dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); if (!dup_segs) { /* Skip dubious DSACK */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS); return false; } NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); /* D-SACK for already forgotten data... Do dumb counting. */ if (tp->undo_marker && tp->undo_retrans > 0 && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs); return true; } /* Check if skb is fully within the SACK block. In presence of GSO skbs, * the incoming SACK may not exactly match but we can find smaller MSS * aligned portion of it that matches. Therefore we might need to fragment * which may fail and creates some hassle (caller must handle error case * returns). * * FIXME: this could be merged to shift decision code */ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, u32 start_seq, u32 end_seq) { int err; bool in_sack; unsigned int pkt_len; unsigned int mss; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq); if (tcp_skb_pcount(skb) > 1 && !in_sack && after(TCP_SKB_CB(skb)->end_seq, start_seq)) { mss = tcp_skb_mss(skb); in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); if (!in_sack) { pkt_len = start_seq - TCP_SKB_CB(skb)->seq; if (pkt_len < mss) pkt_len = mss; } else { pkt_len = end_seq - TCP_SKB_CB(skb)->seq; if (pkt_len < mss) return -EINVAL; } /* Round if necessary so that SACKs cover only full MSSes * and/or the remaining small portion (if present) */ if (pkt_len > mss) { unsigned int new_len = (pkt_len / mss) * mss; if (!in_sack && new_len < pkt_len) new_len += mss; pkt_len = new_len; } if (pkt_len >= skb->len && !in_sack) return 0; err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } return in_sack; } /* Mark the given newly-SACKed range as such, adjusting counters and hints. */ static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, int dup_sack, int pcount, u32 plen, u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount); if ((sacked & TCPCB_SACKED_ACKED) && before(start_seq, state->reord)) state->reord = start_seq; } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ if (!after(end_seq, tp->snd_una)) return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { tcp_rack_advance(tp, sacked, end_seq, xmit_time); if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, * we do not clear RETRANS, believing * that retransmission is still in flight. */ if (sacked & TCPCB_LOST) { sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= pcount; tp->retrans_out -= pcount; } } else { if (!(sacked & TCPCB_RETRANS)) { /* New sack for not retransmitted frame, * which was in hole. It is reordering. */ if (before(start_seq, tcp_highest_sack_seq(tp)) && before(start_seq, state->reord)) state->reord = start_seq; if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; if (state->first_sackt == 0) state->first_sackt = xmit_time; state->last_sackt = xmit_time; } if (sacked & TCPCB_LOST) { sacked &= ~TCPCB_LOST; tp->lost_out -= pcount; } } sacked |= TCPCB_SACKED_ACKED; state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; /* Out-of-order packets delivered */ state->sack_delivered += pcount; state->delivered_bytes += plen; } /* D-SACK. We can detect redundant retransmission in S|R and plain R * frames and clear it. undo_retrans is decreased above, L|R frames * are accounted above as well. */ if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) { sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= pcount; } return sacked; } /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. */ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ BUG_ON(!pcount); /* Adjust counters and hints for the newly sacked sequence * range but discard the return value since prev is already * marked. We must tag the range first because the seq * advancement below implicitly advances * tcp_highest_sack_seq() when skb is highest_sack. */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, skb->len, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; tcp_skb_pcount_add(prev, pcount); WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK * code can come after this skb later on it's better to keep * setting gso_size to something. */ if (!TCP_SKB_CB(prev)->tcp_gso_size) TCP_SKB_CB(prev)->tcp_gso_size = mss; /* CHECKME: To clear or not to clear? Mimics normal skb currently */ if (tcp_skb_pcount(skb) <= 1) TCP_SKB_CB(skb)->tcp_gso_size = 0; /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); if (skb->len > 0) { BUG_ON(!tcp_skb_pcount(skb)); NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED); return false; } /* Whole SKB was eaten :-) */ if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = prev; TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) TCP_SKB_CB(prev)->end_seq++; if (skb == tcp_highest_sack(sk)) tcp_advance_highest_sack(sk, skb); tcp_skb_collapse_tstamp(prev, skb); if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; tcp_rtx_queue_unlink_and_free(skb, sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); return true; } /* I wish gso_size would have a bit more sane initialization than * something-or-zero which complicates things */ static int tcp_skb_seglen(const struct sk_buff *skb) { return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); } /* Shifting pages past head area doesn't work */ static int skb_can_shift(const struct sk_buff *skb) { return !skb_headlen(skb) && skb_is_nonlinear(skb); } int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen) { /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE) * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need * to make sure not storing more than 65535 * 8 bytes per skb, * even if current MSS is bigger. */ if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE)) return 0; if (unlikely(tcp_skb_pcount(to) + pcount > 65535)) return 0; return skb_shift(to, from, shiftlen); } /* Try collapsing SACK blocks spanning across multiple skbs to a single * skb. */ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, struct tcp_sacktag_state *state, u32 start_seq, u32 end_seq, bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev; int mss; int pcount = 0; int len; int in_sack; /* Normally R but no L won't result in plain S */ if (!dup_sack && (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS) goto fallback; if (!skb_can_shift(skb)) goto fallback; /* This frame is about to be dropped (was ACKed). */ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) goto fallback; /* Can only happen with delayed DSACK + discard craziness */ prev = skb_rb_prev(skb); if (!prev) goto fallback; if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; if (!tcp_skb_can_collapse(prev, skb)) goto fallback; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq); if (in_sack) { len = skb->len; pcount = tcp_skb_pcount(skb); mss = tcp_skb_seglen(skb); /* TODO: Fix DSACKs to not fragment already SACKed and we can * drop this restriction as unnecessary */ if (mss != tcp_skb_seglen(prev)) goto fallback; } else { if (!after(TCP_SKB_CB(skb)->end_seq, start_seq)) goto noop; /* CHECKME: This is non-MSS split case only?, this will * cause skipped skbs due to advancing loop btw, original * has that feature too */ if (tcp_skb_pcount(skb) <= 1) goto noop; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); if (!in_sack) { /* TODO: head merge to next could be attempted here * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)), * though it might not be worth of the additional hassle * * ...we can probably just fallback to what was done * previously. We could try merging non-SACKed ones * as well but it probably isn't going to buy off * because later SACKs might again split them, and * it would make skb timestamp tracking considerably * harder problem. */ goto fallback; } len = end_seq - TCP_SKB_CB(skb)->seq; BUG_ON(len < 0); BUG_ON(len > skb->len); /* MSS boundaries should be honoured or else pcount will * severely break even though it makes things bit trickier. * Optimize common case to avoid most of the divides */ mss = tcp_skb_mss(skb); /* TODO: Fix DSACKs to not fragment already SACKed and we can * drop this restriction as unnecessary */ if (mss != tcp_skb_seglen(prev)) goto fallback; if (len == mss) { pcount = 1; } else if (len < mss) { goto noop; } else { pcount = len / mss; len = pcount * mss; } } /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */ if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) goto fallback; if (!tcp_skb_shift(prev, skb, pcount, len)) goto fallback; if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack)) goto out; /* Hole filled allows collapsing with the next as well, this is very * useful when hole on every nth skb pattern happens */ skb = skb_rb_next(prev); if (!skb) goto out; if (!skb_can_shift(skb) || ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || (mss != tcp_skb_seglen(skb))) goto out; if (!tcp_skb_can_collapse(prev, skb)) goto out; len = skb->len; pcount = tcp_skb_pcount(skb); if (tcp_skb_shift(prev, skb, pcount, len)) tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, 0); out: return prev; noop: return skb; fallback: NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK); return NULL; } static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, struct tcp_sacktag_state *state, u32 start_seq, u32 end_seq, bool dup_sack_in) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *tmp; skb_rbtree_walk_from(skb) { int in_sack = 0; bool dup_sack = dup_sack_in; /* queue is in-order => we can short-circuit the walk early */ if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; if (next_dup && before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) { in_sack = tcp_match_skb_to_sack(sk, skb, next_dup->start_seq, next_dup->end_seq); if (in_sack > 0) dup_sack = true; } /* skb reference here is a bit tricky to get right, since * shifting can eat and free both this skb and the next, * so not even _safe variant of the loop is enough. */ if (in_sack <= 0) { tmp = tcp_shift_skb_data(sk, skb, state, start_seq, end_seq, dup_sack); if (tmp) { if (tmp != skb) { skb = tmp; continue; } in_sack = 0; } else { in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq); } } if (unlikely(in_sack < 0)) break; if (in_sack) { TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), skb->len, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) list_del_init(&skb->tcp_tsorted_anchor); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) tcp_advance_highest_sack(sk, skb); } } return skb; } static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq) { struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; struct sk_buff *skb; while (*p) { parent = *p; skb = rb_to_skb(parent); if (before(seq, TCP_SKB_CB(skb)->seq)) { p = &parent->rb_left; continue; } if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { p = &parent->rb_right; continue; } return skb; } return NULL; } static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, u32 skip_to_seq) { if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) return skb; return tcp_sacktag_bsearch(sk, skip_to_seq); } static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, struct tcp_sacktag_state *state, u32 skip_to_seq) { if (!next_dup) return skb; if (before(next_dup->start_seq, skip_to_seq)) { skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); skb = tcp_sacktag_walk(skb, sk, NULL, state, next_dup->start_seq, next_dup->end_seq, 1); } return skb; } static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache) { return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); } static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)->sacked); struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); struct tcp_sack_block sp[TCP_NUM_SACKS]; struct tcp_sack_block *cache; struct sk_buff *skb; int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); int used_sacks; bool found_dup_sack = false; int i, j; int first_sack_index; state->flag = 0; state->reord = tp->snd_nxt; if (!tp->sacked_out) tcp_highest_sack_reset(sk); found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una, state); /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can * contain valid SACK info. */ if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window)) return 0; if (!tp->packets_out) goto out; used_sacks = 0; first_sack_index = 0; for (i = 0; i < num_sacks; i++) { bool dup_sack = !i && found_dup_sack; sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); if (!tcp_is_sackblock_valid(tp, dup_sack, sp[used_sacks].start_seq, sp[used_sacks].end_seq)) { int mib_idx; if (dup_sack) { if (!tp->undo_marker) mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO; else mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD; } else { /* Don't count olds caused by ACK reordering */ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && !after(sp[used_sacks].end_seq, tp->snd_una)) continue; mib_idx = LINUX_MIB_TCPSACKDISCARD; } NET_INC_STATS(sock_net(sk), mib_idx); if (i == 0) first_sack_index = -1; continue; } /* Ignore very old stuff early */ if (!after(sp[used_sacks].end_seq, prior_snd_una)) { if (i == 0) first_sack_index = -1; continue; } used_sacks++; } /* order SACK blocks to allow in order walk of the retrans queue */ for (i = used_sacks - 1; i > 0; i--) { for (j = 0; j < i; j++) { if (after(sp[j].start_seq, sp[j + 1].start_seq)) { swap(sp[j], sp[j + 1]); /* Track where the first SACK block goes to */ if (j == first_sack_index) first_sack_index = j + 1; } } } state->mss_now = tcp_current_mss(sk); skb = NULL; i = 0; if (!tp->sacked_out) { /* It's already past, so skip checking against it */ cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); } else { cache = tp->recv_sack_cache; /* Skip empty blocks in at head of the cache */ while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq && !cache->end_seq) cache++; } while (i < used_sacks) { u32 start_seq = sp[i].start_seq; u32 end_seq = sp[i].end_seq; bool dup_sack = (found_dup_sack && (i == first_sack_index)); struct tcp_sack_block *next_dup = NULL; if (found_dup_sack && ((i + 1) == first_sack_index)) next_dup = &sp[i + 1]; /* Skip too early cached blocks */ while (tcp_sack_cache_ok(tp, cache) && !before(start_seq, cache->end_seq)) cache++; /* Can skip some work by looking recv_sack_cache? */ if (tcp_sack_cache_ok(tp, cache) && !dup_sack && after(end_seq, cache->start_seq)) { /* Head todo? */ if (before(start_seq, cache->start_seq)) { skb = tcp_sacktag_skip(skb, sk, start_seq); skb = tcp_sacktag_walk(skb, sk, next_dup, state, start_seq, cache->start_seq, dup_sack); } /* Rest of the block already fully processed? */ if (!after(end_seq, cache->end_seq)) goto advance_sp; skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, state, cache->end_seq); /* ...tail remains todo... */ if (tcp_highest_sack_seq(tp) == cache->end_seq) { /* ...but better entrypoint exists! */ skb = tcp_highest_sack(sk); if (!skb) break; cache++; goto walk; } skb = tcp_sacktag_skip(skb, sk, cache->end_seq); /* Check overlap against next cached too (past this one already) */ cache++; continue; } if (!before(start_seq, tcp_highest_sack_seq(tp))) { skb = tcp_highest_sack(sk); if (!skb) break; } skb = tcp_sacktag_skip(skb, sk, start_seq); walk: skb = tcp_sacktag_walk(skb, sk, next_dup, state, start_seq, end_seq, dup_sack); advance_sp: i++; } /* Clear the head of the cache sack blocks so we can skip it next time */ for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) { tp->recv_sack_cache[i].start_seq = 0; tp->recv_sack_cache[i].end_seq = 0; } for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker) tcp_check_sack_reordering(sk, state->reord, 0); tcp_verify_left_out(tp); out: #if FASTRETRANS_DEBUG > 0 WARN_ON((int)tp->sacked_out < 0); WARN_ON((int)tp->lost_out < 0); WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif return state->flag; } /* Limits sacked_out so that sum with lost_out isn't ever larger than * packets_out. Returns false if sacked_out adjustement wasn't necessary. */ static bool tcp_limit_reno_sacked(struct tcp_sock *tp) { u32 holes; holes = max(tp->lost_out, 1U); holes = min(holes, tp->packets_out); if ((tp->sacked_out + holes) > tp->packets_out) { tp->sacked_out = tp->packets_out - holes; return true; } return false; } /* If we receive more dupacks than we expected counting segments * in assumption of absent reordering, interpret this as reordering. * The only another reason could be bug in receiver TCP. */ static void tcp_check_reno_reordering(struct sock *sk, const int addend) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_limit_reno_sacked(tp)) return; tp->reordering = min_t(u32, tp->packets_out + addend, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); tp->reord_seen++; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } /* Emulate SACKs for SACKless connection: account for a new dupack. */ static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) { if (num_dupack) { struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; s32 delivered; tp->sacked_out += num_dupack; tcp_check_reno_reordering(sk, 0); delivered = tp->sacked_out - prior_sacked; if (delivered > 0) tcp_count_delivered(tp, delivered, ece_ack); tcp_verify_left_out(tp); } } /* Account for ACK, ACKing some data in Reno Recovery phase. */ static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1), ece_ack); if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else tp->sacked_out -= acked - 1; } tcp_check_reno_reordering(sk, acked); tcp_verify_left_out(tp); } static inline void tcp_reset_reno_sack(struct tcp_sock *tp) { tp->sacked_out = 0; } void tcp_clear_retrans(struct tcp_sock *tp) { tp->retrans_out = 0; tp->lost_out = 0; tp->undo_marker = 0; tp->undo_retrans = -1; tp->sacked_out = 0; tp->rto_stamp = 0; tp->total_rto = 0; tp->total_rto_recoveries = 0; tp->total_rto_time = 0; } static inline void tcp_init_undo(struct tcp_sock *tp) { tp->undo_marker = tp->snd_una; /* Retransmission still in flight may cause DSACKs later. */ /* First, account for regular retransmits in flight: */ tp->undo_retrans = tp->retrans_out; /* Next, account for TLP retransmits in flight: */ if (tp->tlp_high_seq && tp->tlp_retrans) tp->undo_retrans++; /* Finally, avoid 0, because undo_retrans==0 means "can undo now": */ if (!tp->undo_retrans) tp->undo_retrans = -1; } /* If we detect SACK reneging, forget all SACK information * and reset tags completely, otherwise preserve SACKs. If receiver * dropped its ofo queue, we will know this due to reneging detection. */ static void tcp_timeout_mark_lost(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *head; bool is_reneg; /* is receiver reneging on SACKs? */ head = tcp_rtx_queue_head(sk); is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; /* Mark SACK reneging until we recover from this loss event. */ tp->is_sack_reneg = 1; } else if (tcp_is_reno(tp)) { tcp_reset_reno_sack(tp); } skb = head; skb_rbtree_walk_from(skb) { if (is_reneg) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; else if (skb != head && tcp_rack_skb_timeout(tp, skb, 0) > 0) continue; /* Don't mark recently sent ones lost yet */ tcp_mark_skb_lost(sk, skb); } tcp_verify_left_out(tp); tcp_clear_all_retrans_hints(tp); } /* Enter Loss state. */ void tcp_enter_loss(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; u8 reordering; tcp_timeout_mark_lost(sk); /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || !after(tp->high_seq, tp->snd_una) || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->prior_cwnd = tcp_snd_cwnd(tp); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); tcp_init_undo(tp); } tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + 1); tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_jiffies32; /* Timeout in disordered state after receiving substantial DUPACKs * suggests that the degree of reordering is over-estimated. */ reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering); if (icsk->icsk_ca_state <= TCP_CA_Disorder && tp->sacked_out >= reordering) tp->reordering = min_t(unsigned int, tp->reordering, reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; tp->tlp_high_seq = 0; tcp_ecn_queue_cwr(tp); /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous * loss recovery is underway except recurring timeout(s) on * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing */ tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) && (new_recovery || icsk->icsk_retransmits) && !inet_csk(sk)->icsk_mtup.probe_size; } /* If ACK arrived pointing to a remembered SACK, it means that our * remembered SACKs do not reflect real state of receiver i.e. * receiver _host_ is heavily congested (or buggy). * * To avoid big spurious retransmission bursts due to transient SACK * scoreboard oddities that look like reneging, we give the receiver a * little time (max(RTT/2, 10ms)) to send us some more ACKs that will * restore sanity to the SACK scoreboard. If the apparent reneging * persists until this RTO then we'll clear the SACK scoreboard. */ static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag) { if (*ack_flag & FLAG_SACK_RENEGING && *ack_flag & FLAG_SND_UNA_ADVANCED) { struct tcp_sock *tp = tcp_sk(sk); unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), msecs_to_jiffies(10)); tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, false); *ack_flag &= ~FLAG_SET_XMIT_TIMER; return true; } return false; } /* Linux NewReno/SACK/ECN state machine. * -------------------------------------- * * "Open" Normal state, no dubious events, fast path. * "Disorder" In all the respects it is "Open", * but requires a bit more attention. It is entered when * we see some SACKs or dupacks. It is split of "Open" * mainly to move some processing from fast path to slow one. * "CWR" CWND was reduced due to some Congestion Notification event. * It can be ECN, ICMP source quench, local device congestion. * "Recovery" CWND was reduced, we are fast-retransmitting. * "Loss" CWND was reduced due to RTO timeout or SACK reneging. * * tcp_fastretrans_alert() is entered: * - each incoming ACK, if state is not "Open" * - when arrived ACK is unusual, namely: * * SACK * * Duplicate ACK. * * ECN ECE. * * Counting packets in flight is pretty simple. * * in_flight = packets_out - left_out + retrans_out * * packets_out is SND.NXT-SND.UNA counted in packets. * * retrans_out is number of retransmitted segments. * * left_out is number of segments left network, but not ACKed yet. * * left_out = sacked_out + lost_out * * sacked_out: Packets, which arrived to receiver out of order * and hence not ACKed. With SACKs this number is simply * amount of SACKed data. Even without SACKs * it is easy to give pretty reliable estimate of this number, * counting duplicate ACKs. * * lost_out: Packets lost by network. TCP has no explicit * "loss notification" feedback from network (for now). * It means that this number can be only _guessed_. * Actually, it is the heuristics to predict lossage that * distinguishes different algorithms. * * F.e. after RTO, when all the queue is considered as lost, * lost_out = packets_out and in_flight = retrans_out. * * Essentially, we have now a few algorithms detecting * lost packets. * * If the receiver supports SACK: * * RACK (RFC8985): RACK is a newer loss detection algorithm * (2017-) that checks timing instead of counting DUPACKs. * Essentially a packet is considered lost if it's not S/ACKed * after RTT + reordering_window, where both metrics are * dynamically measured and adjusted. This is implemented in * tcp_rack_mark_lost. * * If the receiver does not support SACK: * * NewReno (RFC6582): in Recovery we assume that one segment * is lost (classic Reno). While we are in Recovery and * a partial ACK arrives, we assume that one more packet * is lost (NewReno). This heuristics are the same in NewReno * and SACK. * * The really tricky (and requiring careful tuning) part of the algorithm * is hidden in the RACK code in tcp_recovery.c and tcp_xmit_retransmit_queue(). * The first determines the moment _when_ we should reduce CWND and, * hence, slow down forward transmission. In fact, it determines the moment * when we decide that hole is caused by loss, rather than by a reorder. * * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill * holes, caused by lost packets. * * And the most logically complicated part of algorithm is undo * heuristics. We detect false retransmits due to both too early * fast retransmit (reordering) and underestimated RTO, analyzing * timestamps and D-SACKs. When we detect that some segments were * retransmitted by mistake and CWND reduction was wrong, we undo * window reduction and abort recovery phase. This logic is hidden * inside several functions named tcp_try_undo_<something>. */ /* This function decides, when we should leave Disordered state * and enter Recovery phase, reducing congestion window. * * Main question: may we further continue forward transmission * with the same cwnd? */ static bool tcp_time_to_recover(const struct tcp_sock *tp) { /* Has loss detection marked at least one packet lost? */ return tp->lost_out != 0; } static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when) { return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && before(tp->rx_opt.rcv_tsecr, when); } /* skb is spurious retransmitted if the returned timestamp echo * reply is prior to the skb transmission time */ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, const struct sk_buff *skb) { return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) && tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb)); } /* Nothing was retransmitted or returned timestamp is less * than timestamp of the first retransmission. */ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { const struct sock *sk = (const struct sock *)tp; /* Received an echoed timestamp before the first retransmission? */ if (tp->retrans_stamp) return tcp_tsopt_ecr_before(tp, tp->retrans_stamp); /* We set tp->retrans_stamp upon the first retransmission of a loss * recovery episode, so normally if tp->retrans_stamp is 0 then no * retransmission has happened yet (likely due to TSQ, which can cause * fast retransmits to be delayed). So if snd_una advanced while * (tp->retrans_stamp is 0 then apparently a packet was merely delayed, * not lost. But there are exceptions where we retransmit but then * clear tp->retrans_stamp, so we check for those exceptions. */ /* (1) For non-SACK connections, tcp_is_non_sack_preventing_reopen() * clears tp->retrans_stamp when snd_una == high_seq. */ if (!tcp_is_sack(tp) && !before(tp->snd_una, tp->high_seq)) return false; /* (2) In TCP_SYN_SENT tcp_clean_rtx_queue() clears tp->retrans_stamp * when setting FLAG_SYN_ACKED is set, even if the SYN was * retransmitted. */ if (sk->sk_state == TCP_SYN_SENT) return false; return true; /* tp->retrans_stamp is zero; no retransmit yet */ } /* Undo procedures. */ /* We can clear retrans_stamp when there are no retransmissions in the * window. It would seem that it is trivially available for us in * tp->retrans_out, however, that kind of assumptions doesn't consider * what will happen if errors occur when sending retransmission for the * second time. ...It could the that such segment has only * TCPCB_EVER_RETRANS set at the present time. It seems that checking * the head skb is enough except for some reneging corner cases that * are not worth the effort. * * Main reason for all this complexity is the fact that connection dying * time now depends on the validity of the retrans_stamp, in particular, * that successive retransmissions of a segment must not advance * retrans_stamp under any conditions. */ static bool tcp_any_retrans_done(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; if (tp->retrans_out) return true; skb = tcp_rtx_queue_head(sk); if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) return true; return false; } /* If loss recovery is finished and there are no retransmits out in the * network, then we clear retrans_stamp so that upon the next loss recovery * retransmits_timed_out() and timestamp-undo are using the correct value. */ static void tcp_retrans_stamp_cleanup(struct sock *sk) { if (!tcp_any_retrans_done(sk)) tcp_sk(sk)->retrans_stamp = 0; } static void DBGUNDO(struct sock *sk, const char *msg) { #if FASTRETRANS_DEBUG > 1 struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", msg, &inet->inet_daddr, ntohs(inet->inet_dport), tcp_snd_cwnd(tp), tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", msg, &sk->sk_v6_daddr, ntohs(inet->inet_dport), tcp_snd_cwnd(tp), tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); } #endif #endif } static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) { struct tcp_sock *tp = tcp_sk(sk); if (unmark_loss) { struct sk_buff *skb; skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } tp->lost_out = 0; tcp_clear_all_retrans_hints(tp); } if (tp->prior_ssthresh) { const struct inet_connection_sock *icsk = inet_csk(sk); tcp_snd_cwnd_set(tp, icsk->icsk_ca_ops->undo_cwnd(sk)); if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; tcp_ecn_withdraw_cwr(tp); } } tp->snd_cwnd_stamp = tcp_jiffies32; tp->undo_marker = 0; tp->rack.advanced = 1; /* Force RACK to re-exam losses */ } static inline bool tcp_may_undo(const struct tcp_sock *tp) { return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } static bool tcp_is_non_sack_preventing_reopen(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; return true; } return false; } /* People celebrate: "We love our President!" */ static bool tcp_try_undo_recovery(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_may_undo(tp)) { int mib_idx; /* Happy end! We did not retransmit anything * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); tcp_undo_cwnd_reduction(sk, false); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) mib_idx = LINUX_MIB_TCPLOSSUNDO; else mib_idx = LINUX_MIB_TCPFULLUNDO; NET_INC_STATS(sock_net(sk), mib_idx); } else if (tp->rack.reo_wnd_persist) { tp->rack.reo_wnd_persist--; } if (tcp_is_non_sack_preventing_reopen(sk)) return true; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; return false; } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ static bool tcp_try_undo_dsack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && !tp->undo_retrans) { tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH, tp->rack.reo_wnd_persist + 1); DBGUNDO(sk, "D-SACK"); tcp_undo_cwnd_reduction(sk, false); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); return true; } return false; } /* Undo during loss recovery after partial ACK or using F-RTO. */ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) { struct tcp_sock *tp = tcp_sk(sk); if (frto_undo || tcp_may_undo(tp)) { tcp_undo_cwnd_reduction(sk, true); DBGUNDO(sk, "partial loss"); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); if (frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0); if (tcp_is_non_sack_preventing_reopen(sk)) return true; if (frto_undo || tcp_is_sack(tp)) { tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; } return true; } return false; } /* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937. * It computes the number of packets to send (sndcnt) based on packets newly * delivered: * 1) If the packets in flight is larger than ssthresh, PRR spreads the * cwnd reductions across a full RTT. * 2) Otherwise PRR uses packet conservation to send as much as delivered. * But when SND_UNA is acked without further losses, * slow starts cwnd up to ssthresh to speed up the recovery. */ static void tcp_init_cwnd_reduction(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); tp->high_seq = tp->snd_nxt; tp->tlp_high_seq = 0; tp->snd_cwnd_cnt = 0; tp->prior_cwnd = tcp_snd_cwnd(tp); tp->prr_delivered = 0; tp->prr_out = 0; tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); tcp_ecn_queue_cwr(tp); } void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd)) return; trace_tcp_cwnd_reduction_tp(sk, newly_acked_sacked, newly_lost, flag); tp->prr_delivered += newly_acked_sacked; if (delta < 0) { u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + tp->prior_cwnd - 1; sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; } else { sndcnt = max_t(int, tp->prr_delivered - tp->prr_out, newly_acked_sacked); if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) sndcnt++; sndcnt = min(delta, sndcnt); } /* Force a fast retransmit upon entering fast recovery */ sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + sndcnt); } static inline void tcp_end_cwnd_reduction(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (inet_csk(sk)->icsk_ca_ops->cong_control) return; /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { tcp_snd_cwnd_set(tp, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_jiffies32; } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */ void tcp_enter_cwr(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); tp->prior_ssthresh = 0; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); } } EXPORT_SYMBOL(tcp_enter_cwr); static void tcp_try_keep_open(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int state = TCP_CA_Open; if (tcp_left_out(tp) || tcp_any_retrans_done(sk)) state = TCP_CA_Disorder; if (inet_csk(sk)->icsk_ca_state != state) { tcp_set_ca_state(sk, state); tp->high_seq = tp->snd_nxt; } } static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); tcp_verify_left_out(tp); if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; if (flag & FLAG_ECE) tcp_enter_cwr(sk); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); } } static void tcp_mtup_probe_failed(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; icsk->icsk_mtup.probe_size = 0; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL); } static void tcp_mtup_probe_success(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); u64 val; tp->prior_ssthresh = tcp_current_ssthresh(sk); val = (u64)tcp_snd_cwnd(tp) * tcp_mss_to_mtu(sk, tp->mss_cache); do_div(val, icsk->icsk_mtup.probe_size); DEBUG_NET_WARN_ON_ONCE((u32)val != val); tcp_snd_cwnd_set(tp, max_t(u32, 1U, val)); tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_ssthresh = tcp_current_ssthresh(sk); icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; icsk->icsk_mtup.probe_size = 0; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); } /* Sometimes we deduce that packets have been dropped due to reasons other than * congestion, like path MTU reductions or failed client TFO attempts. In these * cases we call this function to retransmit as many packets as cwnd allows, * without reducing cwnd. Given that retransmits will set retrans_stamp to a * non-zero value (and may do so in a later calling context due to TSQ), we * also enter CA_Loss so that we track when all retransmitted packets are ACKed * and clear retrans_stamp when that happens (to ensure later recurring RTOs * are using the correct retrans_stamp and don't declare ETIMEDOUT * prematurely). */ static void tcp_non_congestion_loss_retransmit(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); if (icsk->icsk_ca_state != TCP_CA_Loss) { tp->high_seq = tp->snd_nxt; tp->snd_ssthresh = tcp_current_ssthresh(sk); tp->prior_ssthresh = 0; tp->undo_marker = 0; tcp_set_ca_state(sk, TCP_CA_Loss); } tcp_xmit_retransmit_queue(sk); } /* Do a simple retransmit without using the backoff mechanisms in * tcp_timer. This is used for path mtu discovery. * The socket is already locked here. */ void tcp_simple_retransmit(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int mss; /* A fastopen SYN request is stored as two separate packets within * the retransmit queue, this is done by tcp_send_syn_data(). * As a result simply checking the MSS of the frames in the queue * will not work for the SYN packet. * * Us being here is an indication of a path MTU issue so we can * assume that the fastopen SYN was lost and just mark all the * frames in the retransmit queue as lost. We will use an MSS of * -1 to mark all frames as lost, otherwise compute the current MSS. */ if (tp->syn_data && sk->sk_state == TCP_SYN_SENT) mss = -1; else mss = tcp_current_mss(sk); skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { if (tcp_skb_seglen(skb) > mss) tcp_mark_skb_lost(sk, skb); } if (!tp->lost_out) return; if (tcp_is_reno(tp)) tcp_limit_reno_sacked(tp); tcp_verify_left_out(tp); /* Don't muck with the congestion window here. * Reason is that we do not increase amount of _data_ * in network, but units changed and effective * cwnd/ssthresh really reduced now. */ tcp_non_congestion_loss_retransmit(sk); } EXPORT_IPV6_MOD(tcp_simple_retransmit); void tcp_enter_recovery(struct sock *sk, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); int mib_idx; /* Start the clock with our fast retransmit, for undo and ETIMEDOUT. */ tcp_retrans_stamp_cleanup(sk); if (tcp_is_reno(tp)) mib_idx = LINUX_MIB_TCPRENORECOVERY; else mib_idx = LINUX_MIB_TCPSACKRECOVERY; NET_INC_STATS(sock_net(sk), mib_idx); tp->prior_ssthresh = 0; tcp_init_undo(tp); if (!tcp_in_cwnd_reduction(sk)) { if (!ece_ack) tp->prior_ssthresh = tcp_current_ssthresh(sk); tcp_init_cwnd_reduction(sk); } tcp_set_ca_state(sk, TCP_CA_Recovery); } static void tcp_update_rto_time(struct tcp_sock *tp) { if (tp->rto_stamp) { tp->total_rto_time += tcp_time_stamp_ms(tp) - tp->rto_stamp; tp->rto_stamp = 0; } } /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are * recovered or spurious. Otherwise retransmits more on partial ACKs. */ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, int *rexmit) { struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) && tcp_try_undo_loss(sk, false)) return; if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ /* Step 3.b. A timeout is spurious if not all data are * lost, i.e., never-retransmitted data are (s)acked. */ if ((flag & FLAG_ORIG_SACK_ACKED) && tcp_try_undo_loss(sk, true)) return; if (after(tp->snd_nxt, tp->high_seq)) { if (flag & FLAG_DATA_SACKED || num_dupack) tp->frto = 0; /* Step 3.a. loss was real */ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { tp->high_seq = tp->snd_nxt; /* Step 2.b. Try send new data (but deferred until cwnd * is updated in tcp_ack()). Otherwise fall back to * the conventional recovery. */ if (!tcp_write_queue_empty(sk) && after(tcp_wnd_end(tp), tp->snd_nxt)) { *rexmit = REXMIT_NEW; return; } tp->frto = 0; } } if (recovered) { /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ tcp_try_undo_recovery(sk); return; } if (tcp_is_reno(tp)) { /* A Reno DUPACK means new data in F-RTO step 2.b above are * delivered. Lower inflight to clock out (re)transmissions. */ if (after(tp->snd_nxt, tp->high_seq) && num_dupack) tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE); else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } *rexmit = REXMIT_LOST; } /* Undo during fast recovery after partial ACK. */ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && tcp_packet_delayed(tp)) { /* Plain luck! Hole if filled with delayed * packet, rather than with a retransmit. Check reordering. */ tcp_check_sack_reordering(sk, prior_snd_una, 1); /* We are getting evidence that the reordering degree is higher * than we realized. If there are no retransmits out then we * can undo. Otherwise we clock out new packets but do not * mark more packets lost or retransmit more. */ if (tp->retrans_out) return true; if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; DBGUNDO(sk, "partial recovery"); tcp_undo_cwnd_reduction(sk, true); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); tcp_try_keep_open(sk); } return false; } static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_rtx_queue_empty(sk)) return; if (unlikely(tcp_is_reno(tp))) { tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED); } else { u32 prior_retrans = tp->retrans_out; if (tcp_rack_mark_lost(sk)) *ack_flag &= ~FLAG_SET_XMIT_TIMER; if (prior_retrans > tp->retrans_out) *ack_flag |= FLAG_LOST_RETRANS; } } /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and * packets lost by network. * * Besides that it updates the congestion state when packet loss or ECN * is detected. But it does not reduce the cwnd, it is done by the * congestion control later. * * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, int num_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int flag = *ack_flag; bool ece_ack = flag & FLAG_ECE; if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ if (ece_ack) tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ if (tcp_check_sack_reneging(sk, ack_flag)) return; /* C. Check consistency of the current state. */ tcp_verify_left_out(tp); /* D. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { WARN_ON(tp->retrans_out != 0 && !tp->syn_data); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { case TCP_CA_CWR: /* CWR is to be held something *above* high_seq * is ACKed for CWR bit to reach receiver. */ if (tp->snd_una != tp->high_seq) { tcp_end_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_Open); } break; case TCP_CA_Recovery: if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); if (tcp_try_undo_recovery(sk)) return; tcp_end_cwnd_reduction(sk); break; } } /* E. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp)) tcp_add_reno_sack(sk, num_dupack, ece_ack); } else if (tcp_try_undo_partial(sk, prior_snd_una)) return; if (tcp_try_undo_dsack(sk)) tcp_try_to_open(sk, flag); tcp_identify_packet_loss(sk, ack_flag); if (icsk->icsk_ca_state != TCP_CA_Recovery) { if (!tcp_time_to_recover(tp)) return; /* Undo reverts the recovery state. If loss is evident, * starts a new recovery (e.g. reordering then loss); */ tcp_enter_recovery(sk, ece_ack); } break; case TCP_CA_Loss: tcp_process_loss(sk, flag, num_dupack, rexmit); if (icsk->icsk_ca_state != TCP_CA_Loss) tcp_update_rto_time(tp); tcp_identify_packet_loss(sk, ack_flag); if (!(icsk->icsk_ca_state == TCP_CA_Open || (*ack_flag & FLAG_LOST_RETRANS))) return; /* Change state if cwnd is undone or retransmits are lost */ fallthrough; default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); tcp_add_reno_sack(sk, num_dupack, ece_ack); } if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); tcp_identify_packet_loss(sk, ack_flag); if (!tcp_time_to_recover(tp)) { tcp_try_to_open(sk, flag); return; } /* MTU probe failure: don't reduce cwnd */ if (icsk->icsk_ca_state < TCP_CA_CWR && icsk->icsk_mtup.probe_size && tp->snd_una == tp->mtu_probe.probe_seq_start) { tcp_mtup_probe_failed(sk); /* Restores the reduction we did in tcp_mtup_probe() */ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); tcp_simple_retransmit(sk); return; } /* Otherwise enter Recovery state */ tcp_enter_recovery(sk, ece_ack); } *rexmit = REXMIT_LOST; } static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) { u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ; struct tcp_sock *tp = tcp_sk(sk); if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { /* If the remote keeps returning delayed ACKs, eventually * the min filter would pick it up and overestimate the * prop. delay when it expires. Skip suspected delayed ACKs. */ return; } minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, rtt_us ? : jiffies_to_usecs(1)); } static bool tcp_ack_update_rtt(struct sock *sk, const int flag, long seq_rtt_us, long sack_rtt_us, long ca_rtt_us, struct rate_sample *rs) { const struct tcp_sock *tp = tcp_sk(sk); /* Prefer RTT measured from ACK's timing to TS-ECR. This is because * broken middle-boxes or peers may corrupt TS-ECR fields. But * Karn's algorithm forbids taking RTT if some retransmitted data * is acked (RFC6298). */ if (seq_rtt_us < 0) seq_rtt_us = sack_rtt_us; /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment * acknowledges some new data, i.e., only if it advances the * left edge of the send window. * See draft-ietf-tcplw-high-performance-00, section 3.3. */ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1); rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) return false; /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is * always taken together with ACK, SACK, or TS-opts. Any negative * values will be skipped with the seq_rtt_us < 0 check above. */ tcp_update_rtt_min(sk, ca_rtt_us, flag); tcp_rtt_estimator(sk, seq_rtt_us); tcp_set_rto(sk); /* RFC6298: only reset backoff on valid RTT measurement. */ inet_csk(sk)->icsk_backoff = 0; return true; } /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) { struct rate_sample rs; long rtt_us = -1L; if (req && !req->num_retrans && tcp_rsk(req)->snt_synack) rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack); tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs); } static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { const struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32; } /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ void tcp_rearm_rto(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); /* If the retrans timer is currently being used by Fast Open * for SYN-ACK retrans purpose, stay put. */ if (rcu_access_pointer(tp->fastopen_rsk)) return; if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { u32 rto = inet_csk(sk)->icsk_rto; /* Offset the time elapsed after installing regular RTO */ if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { s64 delta_us = tcp_rto_delta_us(sk); /* delta_us may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. */ rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, true); } } /* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ static void tcp_set_xmit_timer(struct sock *sk) { if (!tcp_schedule_loss_probe(sk, true)) tcp_rearm_rto(sk); } /* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); u32 packets_acked; BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)); packets_acked = tcp_skb_pcount(skb); if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return 0; packets_acked -= tcp_skb_pcount(skb); if (packets_acked) { BUG_ON(tcp_skb_pcount(skb) == 0); BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); } return packets_acked; } static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, const struct sk_buff *ack_skb, u32 prior_snd_una) { const struct skb_shared_info *shinfo; /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */ if (likely(!TCP_SKB_CB(skb)->txstamp_ack)) return; shinfo = skb_shinfo(skb); if (!before(shinfo->tskey, prior_snd_una) && before(shinfo->tskey, tcp_sk(sk)->snd_una)) { tcp_skb_tsorted_save(skb) { __skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK); } tcp_skb_tsorted_restore(skb); } } /* Remove acknowledged frames from the retransmission queue. If our packet * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, u32 prior_fack, u32 prior_snd_una, struct tcp_sacktag_state *sack, bool ece_ack) { const struct inet_connection_sock *icsk = inet_csk(sk); u64 first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */ struct sk_buff *skb, *next; bool fully_acked = true; long sack_rtt_us = -1L; long seq_rtt_us = -1L; long ca_rtt_us = -1L; u32 pkts_acked = 0; bool rtt_update; int flag = 0; first_ackt = 0; for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); const u32 start_seq = scb->seq; u8 sacked = scb->sacked; u32 acked_pcount; /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { if (tcp_skb_pcount(skb) == 1 || !after(tp->snd_una, scb->seq)) break; acked_pcount = tcp_tso_acked(sk, skb); if (!acked_pcount) break; fully_acked = false; } else { acked_pcount = tcp_skb_pcount(skb); } if (unlikely(sacked & TCPCB_RETRANS)) { if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; } else if (!(sacked & TCPCB_SACKED_ACKED)) { last_ackt = tcp_skb_timestamp_us(skb); WARN_ON_ONCE(last_ackt == 0); if (!first_ackt) first_ackt = last_ackt; if (before(start_seq, reord)) reord = start_seq; if (!after(scb->end_seq, tp->high_seq)) flag |= FLAG_ORIG_SACK_ACKED; } if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; /* snd_una delta covers these skbs */ sack->delivered_bytes -= skb->len; } else if (tcp_is_sack(tp)) { tcp_count_delivered(tp, acked_pcount, ece_ack); if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, tcp_skb_timestamp_us(skb)); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; tp->packets_out -= acked_pcount; pkts_acked += acked_pcount; tcp_rate_skb_delivered(sk, skb, sack->rate); /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. It is not * true data, and if we misinform our callers that * this ACK acks real data, we will erroneously exit * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ if (likely(!(scb->tcp_flags & TCPHDR_SYN))) { flag |= FLAG_DATA_ACKED; } else { flag |= FLAG_SYN_ACKED; tp->retrans_stamp = 0; } if (!fully_acked) break; tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una); next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; tcp_highest_sack_replace(sk, skb, next); tcp_rtx_queue_unlink_and_free(skb, sk); } if (!skb) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) tp->snd_up = tp->snd_una; if (skb) { tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) flag |= FLAG_SACK_RENEGING; } if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); if (pkts_acked == 1 && fully_acked && !prior_sacked && (tp->snd_una - prior_snd_una) < tp->mss_cache && sack->rate->prior_delivered + 1 == tp->delivered && !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { /* Conservatively mark a delayed ACK. It's typically * from a lone runt packet over the round trip to * a receiver w/o out-of-order or CE events. */ flag |= FLAG_ACK_MAYBE_DELAYED; } } if (sack->first_sackt) { sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt); } rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, ca_rtt_us, sack->rate); if (flag & FLAG_ACKED) { flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { tcp_mtup_probe_success(sk); } if (tcp_is_reno(tp)) { tcp_remove_reno_sacks(sk, pkts_acked, ece_ack); /* If any of the cumulatively ACKed segments was * retransmitted, non-SACK case cannot confirm that * progress was due to original transmission due to * lack of TCPCB_SACKED_ACKED bits even if some of * the packets may have been never retransmitted. */ if (flag & FLAG_RETRANS_DATA_ACKED) flag &= ~FLAG_ORIG_SACK_ACKED; } else { /* Non-retransmitted hole got filled? That's reordering */ if (before(reord, prior_fack)) tcp_check_sack_reordering(sk, reord, 0); } sack->delivered_bytes = (skb ? TCP_SKB_CB(skb)->seq : tp->snd_una) - prior_snd_una; } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb))) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. */ flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ } if (icsk->icsk_ca_ops->pkts_acked) { struct ack_sample sample = { .pkts_acked = pkts_acked, .rtt_us = sack->rate->rtt_us }; sample.in_flight = tp->mss_cache * (tp->delivered - sack->rate->prior_delivered); icsk->icsk_ca_ops->pkts_acked(sk, &sample); } #if FASTRETRANS_DEBUG > 0 WARN_ON((int)tp->sacked_out < 0); WARN_ON((int)tp->lost_out < 0); WARN_ON((int)tp->retrans_out < 0); if (!tp->packets_out && tcp_is_sack(tp)) { icsk = inet_csk(sk); if (tp->lost_out) { pr_debug("Leak l=%u %d\n", tp->lost_out, icsk->icsk_ca_state); tp->lost_out = 0; } if (tp->sacked_out) { pr_debug("Leak s=%u %d\n", tp->sacked_out, icsk->icsk_ca_state); tp->sacked_out = 0; } if (tp->retrans_out) { pr_debug("Leak r=%u %d\n", tp->retrans_out, icsk->icsk_ca_state); tp->retrans_out = 0; } } #endif return flag; } static void tcp_ack_probe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *head = tcp_send_head(sk); const struct tcp_sock *tp = tcp_sk(sk); /* Was it a usable window open? */ if (!head) return; if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; icsk->icsk_probes_tstamp = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). * This function is not for random using! */ } else { unsigned long when = tcp_probe0_when(sk, tcp_rto_max(sk)); when = tcp_clamp_probe0_to_user_timeout(sk, when); tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, true); } } static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) { return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open; } /* Decide wheather to run the increase function of congestion control. */ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { /* If reordering is high then always grow cwnd whenever data is * delivered regardless of its ordering. Otherwise stay conservative * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ * new SACK or ECE mark may first advance cwnd here and later reduce * cwnd in tcp_fastretrans_alert() based on more states. */ if (tcp_sk(sk)->reordering > READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering)) return flag & FLAG_FORWARD_PROGRESS; return flag & FLAG_DATA_ACKED; } /* The "ultimate" congestion control function that aims to replace the rigid * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction). * It's called toward the end of processing an ACK with precise rate * information. All transmission or retransmission are delayed afterwards. */ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, int flag, const struct rate_sample *rs) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->cong_control) { icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs); return; } if (tcp_in_cwnd_reduction(sk)) { /* Reduce cwnd if state mandates */ tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag); } else if (tcp_may_raise_cwnd(sk, flag)) { /* Advance cwnd if state allows */ tcp_cong_avoid(sk, ack, acked_sacked); } tcp_update_pacing_rate(sk); } /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ static inline bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, const u32 ack_seq, const u32 nwin) { return after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || (ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin)); } static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack) { #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao; if (!static_branch_unlikely(&tcp_ao_needed.key)) return; ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held((struct sock *)tp)); if (ao && ack < tp->snd_una) { ao->snd_sne++; trace_tcp_ao_snd_sne_update((struct sock *)tp, ao->snd_sne); } #endif } /* If we update tp->snd_una, also update tp->bytes_acked */ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) { u32 delta = ack - tp->snd_una; sock_owned_by_me((struct sock *)tp); tp->bytes_acked += delta; tcp_snd_sne_update(tp, ack); tp->snd_una = ack; } static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq) { #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao; if (!static_branch_unlikely(&tcp_ao_needed.key)) return; ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held((struct sock *)tp)); if (ao && seq < tp->rcv_nxt) { ao->rcv_sne++; trace_tcp_ao_rcv_sne_update((struct sock *)tp, ao->rcv_sne); } #endif } /* If we update tp->rcv_nxt, also update tp->bytes_received */ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) { u32 delta = seq - tp->rcv_nxt; sock_owned_by_me((struct sock *)tp); tp->bytes_received += delta; tcp_rcv_sne_update(tp, seq); WRITE_ONCE(tp->rcv_nxt, seq); } /* Update our send window. * * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 * and in FreeBSD. NetBSD's one is even worse.) is wrong. */ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack, u32 ack_seq) { struct tcp_sock *tp = tcp_sk(sk); int flag = 0; u32 nwin = ntohs(tcp_hdr(skb)->window); if (likely(!tcp_hdr(skb)->syn)) nwin <<= tp->rx_opt.snd_wscale; if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { flag |= FLAG_WIN_UPDATE; tcp_update_wl(tp, ack_seq); if (tp->snd_wnd != nwin) { tp->snd_wnd = nwin; /* Note, it is the only place, where * fast path is recovered for sending TCP. */ tp->pred_flags = 0; tcp_fast_path_check(sk); if (!tcp_write_queue_empty(sk)) tcp_slow_start_after_idle_check(sk); if (nwin > tp->max_window) { tp->max_window = nwin; tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); } } } tcp_snd_una_update(tp, ack); return flag; } static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, u32 *last_oow_ack_time) { /* Paired with the WRITE_ONCE() in this function. */ u32 val = READ_ONCE(*last_oow_ack_time); if (val) { s32 elapsed = (s32)(tcp_jiffies32 - val); if (0 <= elapsed && elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) { NET_INC_STATS(net, mib_idx); return true; /* rate-limited: don't send yet! */ } } /* Paired with the prior READ_ONCE() and with itself, * as we might be lockless. */ WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32); return false; /* not rate-limited: go ahead, send dupack now! */ } /* Return true if we're currently rate-limiting out-of-window ACKs and * thus shouldn't send a dupack right now. We rate-limit dupacks in * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS * attacks that send repeated SYNs or ACKs for the same connection. To * do this, we do not send a duplicate SYNACK or ACK if the remote * endpoint is sending out-of-window SYNs or pure ACKs at a high rate. */ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, int mib_idx, u32 *last_oow_ack_time) { /* Data packets without SYNs are not likely part of an ACK loop. */ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && !tcp_hdr(skb)->syn) return false; return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector) { struct tcp_sock *tp = tcp_sk(sk); u16 flags = 0; if (accecn_reflector) flags = tcp_accecn_reflector_flags(tp->syn_ect_rcv); __tcp_send_ack(sk, tp->rcv_nxt, flags); } /* RFC 5961 7 [ACK Throttling] */ static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); u32 count, now, ack_limit; /* First check our per-socket dupack rate limit. */ if (__tcp_oow_rate_limited(net, LINUX_MIB_TCPACKSKIPPEDCHALLENGE, &tp->last_oow_ack_time)) return; ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); if (ack_limit == INT_MAX) goto send_ack; /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; if (now != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) { u32 half = (ack_limit + 1) >> 1; WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now); WRITE_ONCE(net->ipv4.tcp_challenge_count, get_random_u32_inclusive(half, ack_limit + half - 1)); } count = READ_ONCE(net->ipv4.tcp_challenge_count); if (count > 0) { WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1); send_ack: NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack_reflect_ect(sk, accecn_reflector); } } static void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); } static int __tcp_replace_ts_recent(struct tcp_sock *tp, s32 tstamp_delta) { tcp_store_ts_recent(tp); return tstamp_delta > 0 ? FLAG_TS_PROGRESS : 0; } static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) { s32 delta; if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { /* PAWS bug workaround wrt. ACK frames, the PAWS discard * extra check below makes sure this can only happen * for pure ACK frames. -DaveM * * Not only, also it occurs for expired timestamps. */ if (tcp_paws_check(&tp->rx_opt, 0)) { delta = tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent; return __tcp_replace_ts_recent(tp, delta); } } return 0; } /* This routine deals with acks during a TLP episode and ends an episode by * resetting tlp_high_seq. Ref: TLP algorithm in RFC8985 */ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) { struct tcp_sock *tp = tcp_sk(sk); if (before(ack, tp->tlp_high_seq)) return; if (!tp->tlp_retrans) { /* TLP of new data has been acknowledged */ tp->tlp_high_seq = 0; } else if (flag & FLAG_DSACK_TLP) { /* This DSACK means original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; } else if (after(ack, tp->tlp_high_seq)) { /* ACK advances: there was a loss, so reduce cwnd. Reset * tlp_high_seq in tcp_init_cwnd_reduction() */ tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); tcp_try_keep_open(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBERECOVERY); } else if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP | FLAG_DATA_SACKED))) { /* Pure dupack: original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; } } static void tcp_in_ack_event(struct sock *sk, int flag) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->in_ack_event) { u32 ack_ev_flags = 0; if (flag & FLAG_WIN_UPDATE) ack_ev_flags |= CA_ACK_WIN_UPDATE; if (flag & FLAG_SLOWPATH) { ack_ev_flags |= CA_ACK_SLOWPATH; if (flag & FLAG_ECE) ack_ev_flags |= CA_ACK_ECE; } icsk->icsk_ca_ops->in_ack_event(sk, ack_ev_flags); } } /* Congestion control has updated the cwnd already. So if we're in * loss recovery then now we do any new sends (for FRTO) or * retransmits (for CA_Loss or CA_recovery) that make sense. */ static void tcp_xmit_recovery(struct sock *sk, int rexmit) { struct tcp_sock *tp = tcp_sk(sk); if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) return; if (unlikely(rexmit == REXMIT_NEW)) { __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); if (after(tp->snd_nxt, tp->high_seq)) return; tp->frto = 0; } tcp_xmit_retransmit_queue(sk); } /* Returns the number of packets newly acked or sacked by the current ACK */ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, u32 ecn_count, int flag) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); u32 delivered; delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); if (flag & FLAG_ECE) { if (tcp_ecn_mode_rfc3168(tp)) ecn_count = delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count); } return delivered; } /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_sacktag_state sack_state; struct rate_sample rs = { .prior_delivered = 0 }; u32 prior_snd_una = tp->snd_una; bool is_sack_reneg = tp->is_sack_reneg; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; int num_dupack = 0; int prior_packets = tp->packets_out; u32 delivered = tp->delivered; u32 lost = tp->lost; int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? */ u32 prior_fack; sack_state.first_sackt = 0; sack_state.rate = &rs; sack_state.sack_delivered = 0; sack_state.delivered_bytes = 0; /* We very likely will need to access rtx queue. */ prefetch(sk->tcp_rtx_queue.rb_node); /* If the ack is older than previous acks * then we can probably ignore it. */ if (before(ack, prior_snd_una)) { u32 max_window; /* do not accept ACK for bytes we never sent. */ max_window = min_t(u64, tp->max_window, tp->bytes_acked); /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - max_window)) { if (!(flag & FLAG_NO_CHALLENGE_ACK)) tcp_send_challenge_ack(sk, false); return -SKB_DROP_REASON_TCP_TOO_OLD_ACK; } goto old_ack; } /* If the ack includes data we haven't sent yet, discard * this segment (RFC793 Section 3.9). */ if (after(ack, tp->snd_nxt)) return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA; if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; WRITE_ONCE(icsk->icsk_retransmits, 0); #if IS_ENABLED(CONFIG_TLS_DEVICE) if (static_branch_unlikely(&clean_acked_data_enabled.key)) if (tp->tcp_clean_acked) tp->tcp_clean_acked(sk, ack); #endif } prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet * is in window. */ if (flag & FLAG_UPDATE_TS_RECENT) flag |= tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == FLAG_SND_UNA_ADVANCED) { /* Window is constant, pure forward advance. * No more checks are required. * Note, we use the fact that SND.UNA>=SND.WL2. */ tcp_update_wl(tp, ack_seq); tcp_snd_una_update(tp, ack); flag |= FLAG_WIN_UPDATE; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; if (sack_state.sack_delivered) tcp_count_delivered(tp, sack_state.sack_delivered, flag & FLAG_ECE); } /* This is a deviation from RFC3168 since it states that: * "When the TCP data sender is ready to set the CWR bit after reducing * the congestion window, it SHOULD set the CWR bit only on the first * new data packet that it transmits." * We accept CWR on pure ACKs to be more robust * with widely-deployed TCP implementations that do this. */ tcp_ecn_accept_cwr(sk, skb); /* We passed data and got it acked, remove any soft error * log. Something worked... */ if (READ_ONCE(sk->sk_err_soft)) WRITE_ONCE(sk->sk_err_soft, 0); WRITE_ONCE(icsk->icsk_probes_out, 0); tp->rcv_tstamp = tcp_jiffies32; if (!prior_packets) goto no_queue; /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una, &sack_state, flag & FLAG_ECE); tcp_rack_update_reo_wnd(sk, &rs); if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); if (tcp_ack_is_dubious(sk, flag)) { if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP | FLAG_DSACKING_ACK))) { num_dupack = 1; /* Consider if pure acks were aggregated in tcp_add_backlog() */ if (!(flag & FLAG_DATA)) num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs); } tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); } /* If needed, reset TLP/RTO timer when RACK doesn't set. */ if (flag & FLAG_SET_XMIT_TIMER) tcp_set_xmit_timer(sk); if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; no_queue: if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); tcp_newly_delivered(sk, delivered, ecn_count, flag); } /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ tcp_ack_probe(sk); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); return 1; old_ack: /* If data was SACKed, tag it and see if we should send more data. * If data was DSACKed, see if we can undo a cwnd reduction. */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); tcp_newly_delivered(sk, delivered, ecn_count, flag); tcp_xmit_recovery(sk, rexmit); } return 0; } static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, bool syn, struct tcp_fastopen_cookie *foc, bool exp_opt) { /* Valid only in SYN or SYN-ACK with an even length. */ if (!foc || !syn || len < 0 || (len & 1)) return; if (len >= TCP_FASTOPEN_COOKIE_MIN && len <= TCP_FASTOPEN_COOKIE_MAX) memcpy(foc->val, cookie, len); else if (len != 0) len = -1; foc->len = len; foc->exp = exp_opt; } static bool smc_parse_options(const struct tcphdr *th, struct tcp_options_received *opt_rx, const unsigned char *ptr, int opsize) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (th->syn && !(opsize & 1) && opsize >= TCPOLEN_EXP_SMC_BASE && get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { opt_rx->smc_ok = 1; return true; } } #endif return false; } /* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped * value on success. */ u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) { const unsigned char *ptr = (const unsigned char *)(th + 1); int length = (th->doff * 4) - sizeof(struct tcphdr); u16 mss = 0; while (length > 0) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return mss; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return mss; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return mss; if (opsize > length) return mss; /* fail on partial options */ if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) { u16 in_mss = get_unaligned_be16(ptr); if (in_mss) { if (user_mss && user_mss < in_mss) in_mss = user_mss; mss = in_mss; } } ptr += opsize - 2; length -= opsize; } } return mss; } /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. */ void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc) { const unsigned char *ptr; const struct tcphdr *th = tcp_hdr(skb); int length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; opt_rx->accecn = 0; opt_rx->saw_unknown = 0; while (length > 0) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ switch (opcode) { case TCPOPT_MSS: if (opsize == TCPOLEN_MSS && th->syn && !estab) { u16 in_mss = get_unaligned_be16(ptr); if (in_mss) { if (opt_rx->user_mss && opt_rx->user_mss < in_mss) in_mss = opt_rx->user_mss; opt_rx->mss_clamp = in_mss; } } break; case TCPOPT_WINDOW: if (opsize == TCPOLEN_WINDOW && th->syn && !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) { __u8 snd_wscale = *(__u8 *)ptr; opt_rx->wscale_ok = 1; if (snd_wscale > TCP_MAX_WSCALE) { net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n", __func__, snd_wscale, TCP_MAX_WSCALE); snd_wscale = TCP_MAX_WSCALE; } opt_rx->snd_wscale = snd_wscale; } break; case TCPOPT_TIMESTAMP: if ((opsize == TCPOLEN_TIMESTAMP) && ((estab && opt_rx->tstamp_ok) || (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) { opt_rx->saw_tstamp = 1; opt_rx->rcv_tsval = get_unaligned_be32(ptr); opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); } break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM && th->syn && !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) { opt_rx->sack_ok = TCP_SACK_SEEN; tcp_sack_reset(opt_rx); } break; case TCPOPT_SACK: if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && opt_rx->sack_ok) { TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; } break; #ifdef CONFIG_TCP_MD5SIG case TCPOPT_MD5SIG: /* The MD5 Hash has already been * checked (see tcp_v{4,6}_rcv()). */ break; #endif #ifdef CONFIG_TCP_AO case TCPOPT_AO: /* TCP AO has already been checked * (see tcp_inbound_ao_hash()). */ break; #endif case TCPOPT_FASTOPEN: tcp_parse_fastopen_option( opsize - TCPOLEN_FASTOPEN_BASE, ptr, th->syn, foc, false); break; case TCPOPT_ACCECN0: case TCPOPT_ACCECN1: /* Save offset of AccECN option in TCP header */ opt_rx->accecn = (ptr - 2) - (__u8 *)th; break; case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. */ if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE && get_unaligned_be16(ptr) == TCPOPT_FASTOPEN_MAGIC) { tcp_parse_fastopen_option(opsize - TCPOLEN_EXP_FASTOPEN_BASE, ptr + 2, th->syn, foc, true); break; } if (smc_parse_options(th, opt_rx, ptr, opsize)) break; opt_rx->saw_unknown = 1; break; default: opt_rx->saw_unknown = 1; } ptr += opsize-2; length -= opsize; } } } EXPORT_SYMBOL(tcp_parse_options); static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) { const __be32 *ptr = (const __be32 *)(th + 1); if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { tp->rx_opt.saw_tstamp = 1; ++ptr; tp->rx_opt.rcv_tsval = ntohl(*ptr); ++ptr; if (*ptr) tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; else tp->rx_opt.rcv_tsecr = 0; return true; } return false; } /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ static bool tcp_fast_parse_options(const struct net *net, const struct sk_buff *skb, const struct tcphdr *th, struct tcp_sock *tp) { /* In the spirit of fast parsing, compare doff directly to constant * values. Because equality is used, short doff can be ignored here. */ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; tp->rx_opt.accecn = 0; return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { if (tcp_parse_aligned_timestamp(tp, th)) { tp->rx_opt.accecn = 0; return true; } } tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; return true; } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) /* * Parse Signature options */ int tcp_do_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const u8 **ao_hash) { int length = (th->doff << 2) - sizeof(*th); const u8 *ptr = (const u8 *)(th + 1); unsigned int minlen = TCPOLEN_MD5SIG; if (IS_ENABLED(CONFIG_TCP_AO)) minlen = sizeof(struct tcp_ao_hdr) + 1; *md5_hash = NULL; *ao_hash = NULL; /* If not enough data remaining, we can short cut */ while (length >= minlen) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return 0; case TCPOPT_NOP: length--; continue; default: opsize = *ptr++; if (opsize < 2 || opsize > length) return -EINVAL; if (opcode == TCPOPT_MD5SIG) { if (opsize != TCPOLEN_MD5SIG) return -EINVAL; if (unlikely(*md5_hash || *ao_hash)) return -EEXIST; *md5_hash = ptr; } else if (opcode == TCPOPT_AO) { if (opsize <= sizeof(struct tcp_ao_hdr)) return -EINVAL; if (unlikely(*md5_hash || *ao_hash)) return -EEXIST; *ao_hash = ptr; } } ptr += opsize - 2; length -= opsize; } return 0; } EXPORT_SYMBOL(tcp_do_parse_auth_options); #endif /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM * * It is not fatal. If this ACK does _not_ change critical state (seqs, window) * it can pass through stack. So, the following predicate verifies that * this segment is not used for anything but congestion avoidance or * fast retransmit. Moreover, we even are able to eliminate most of such * second order effects, if we apply some small "replay" window (~RTO) * to timestamp space. * * All these measures still do not guarantee that we reject wrapped ACKs * on networks with high bandwidth, when sequence space is recycled fastly, * but it guarantees that such events will be very rare and do not affect * connection seriously. This doesn't look nice, but alas, PAWS is really * buggy extension. * * [ Later note. Even worse! It is buggy for segments _with_ data. RFC * states that events when retransmit arrives after original data are rare. * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is * the biggest problem on large power networks even with minor reordering. * OK, let's give it small replay window. If peer clock is even 1hz, it is safe * up to bandwidth of 18Gigabit/sec. 8) ] */ /* Estimates max number of increments of remote peer TSval in * a replay window (based on our current RTO estimation). */ static u32 tcp_tsval_replay(const struct sock *sk) { /* If we use usec TS resolution, * then expect the remote peer to use the same resolution. */ if (tcp_sk(sk)->tcp_usec_ts) return inet_csk(sk)->icsk_rto * (USEC_PER_SEC / HZ); /* RFC 7323 recommends a TSval clock between 1ms and 1sec. * We know that some OS (including old linux) can use 1200 Hz. */ return inet_csk(sk)->icsk_rto * 1200 / HZ; } static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk, const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); const struct tcphdr *th = tcp_hdr(skb); SKB_DR_INIT(reason, TCP_RFC7323_PAWS); u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 seq = TCP_SKB_CB(skb)->seq; /* 1. Is this not a pure ACK ? */ if (!th->ack || seq != TCP_SKB_CB(skb)->end_seq) return reason; /* 2. Is its sequence not the expected one ? */ if (seq != tp->rcv_nxt) return before(seq, tp->rcv_nxt) ? SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK : reason; /* 3. Is this not a duplicate ACK ? */ if (ack != tp->snd_una) return reason; /* 4. Is this updating the window ? */ if (tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale)) return reason; /* 5. Is this not in the replay window ? */ if ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > tcp_tsval_replay(sk)) return reason; return 0; } /* Check segment sequence number for validity. * * Segment controls are considered valid, if the segment * fits to the window after truncation to the window. Acceptability * of data (and SYN, FIN, of course) is checked separately. * See tcp_data_queue(), for example. * * Also, controls (RST is main one) are accepted using RCV.WUP instead * of RCV.NXT. Peer still did not advance his SND.UNA when we * delayed ACK, so that hisSND.UNA<=ourRCV.WUP. * (borrowed from freebsd) */ static enum skb_drop_reason tcp_sequence(const struct sock *sk, u32 seq, u32 end_seq) { const struct tcp_sock *tp = tcp_sk(sk); if (before(end_seq, tp->rcv_wup)) return SKB_DROP_REASON_TCP_OLD_SEQUENCE; if (after(end_seq, tp->rcv_nxt + tcp_receive_window(tp))) { if (after(seq, tp->rcv_nxt + tcp_receive_window(tp))) return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; /* Only accept this packet if receive queue is empty. */ if (skb_queue_len(&sk->sk_receive_queue)) return SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE; } return SKB_NOT_DROPPED_YET; } void tcp_done_with_error(struct sock *sk, int err) { /* This barrier is coupled with smp_rmb() in tcp_poll() */ WRITE_ONCE(sk->sk_err, err); smp_wmb(); tcp_write_queue_purge(sk); tcp_done(sk); if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } EXPORT_IPV6_MOD(tcp_done_with_error); /* When we get a reset we do this. */ void tcp_reset(struct sock *sk, struct sk_buff *skb) { int err; trace_tcp_receive_reset(sk); /* mptcp can't tell us to ignore reset pkts, * so just ignore the return value of mptcp_incoming_options(). */ if (sk_is_mptcp(sk)) mptcp_incoming_options(sk, skb); /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: err = ECONNREFUSED; break; case TCP_CLOSE_WAIT: err = EPIPE; break; case TCP_CLOSE: return; default: err = ECONNRESET; } tcp_done_with_error(sk, err); } /* * Process the FIN bit. This now behaves as it is supposed to work * and the FIN takes effect when it is validly part of sequence * space. Not before when we get holes. * * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT * (and thence onto LAST-ACK and finally, CLOSE, we never enter * TIME-WAIT) * * If we are in FINWAIT-1, a received FIN indicates simultaneous * close and we go into CLOSING (and later onto TIME-WAIT) * * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. */ void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); inet_csk_schedule_ack(sk); WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); sock_set_flag(sk, SOCK_DONE); switch (sk->sk_state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); inet_csk_enter_pingpong_mode(sk); break; case TCP_CLOSE_WAIT: case TCP_CLOSING: /* Received a retransmission of the FIN, do * nothing. */ break; case TCP_LAST_ACK: /* RFC793: Remain in the LAST-ACK state. */ break; case TCP_FIN_WAIT1: /* This case occurs when a simultaneous close * happens, we must ack the received FIN and * enter the CLOSING state. */ tcp_send_ack(sk); tcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: /* Received a FIN -- send ACK and enter TIME_WAIT. */ tcp_send_ack(sk); tcp_time_wait(sk, TCP_TIME_WAIT, 0); break; default: /* Only TCP_LISTEN and TCP_CLOSE are left, in these * cases we should never reach this piece of code. */ pr_err("%s: Impossible, sk->sk_state=%d\n", __func__, sk->sk_state); break; } /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. */ skb_rbtree_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); /* Do not send POLL_HUP for half duplex close. */ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); else sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } } static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) { if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { if (before(seq, sp->start_seq)) sp->start_seq = seq; if (after(end_seq, sp->end_seq)) sp->end_seq = end_seq; return true; } return false; } static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { int mib_idx; if (before(seq, tp->rcv_nxt)) mib_idx = LINUX_MIB_TCPDSACKOLDSENT; else mib_idx = LINUX_MIB_TCPDSACKOFOSENT; NET_INC_STATS(sock_net(sk), mib_idx); tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; tp->duplicate_sack[0].end_seq = end_seq; } } static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); if (!tp->rx_opt.dsack) tcp_dsack_set(sk, seq, end_seq); else tcp_sack_extend(tp->duplicate_sack, seq, end_seq); } static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) { /* When the ACK path fails or drops most ACKs, the sender would * timeout and spuriously retransmit the same segment repeatedly. * If it seems our ACKs are not reaching the other side, * based on receiving a duplicate data segment with new flowlabel * (suggesting the sender suffered an RTO), and we are not already * repathing due to our own RTO, then rehash the socket to repath our * packets. */ #if IS_ENABLED(CONFIG_IPV6) if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss && skb->protocol == htons(ETH_P_IPV6) && (tcp_sk(sk)->inet_conn.icsk_ack.lrcv_flowlabel != ntohl(ip6_flowlabel(ipv6_hdr(skb)))) && sk_rethink_txhash(sk)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); /* Save last flowlabel after a spurious retrans. */ tcp_save_lrcv_flowlabel(sk, skb); #endif } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; tcp_rcv_spurious_retrans(sk, skb); if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) end_seq = tp->rcv_nxt; tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq); } } tcp_send_ack(sk); } /* These routines update the SACK block as out-of-order packets arrive or * in-order packets close up the sequence space. */ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) { int this_sack; struct tcp_sack_block *sp = &tp->selective_acks[0]; struct tcp_sack_block *swalk = sp + 1; /* See if the recent change to the first SACK eats into * or hits the sequence space of other SACK blocks, if so coalesce. */ for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) { if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) { int i; /* Zap SWALK, by moving every further SACK up by one slot. * Decrease num_sacks. */ tp->rx_opt.num_sacks--; for (i = this_sack; i < tp->rx_opt.num_sacks; i++) sp[i] = sp[i + 1]; continue; } this_sack++; swalk++; } } void tcp_sack_compress_send_ack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (!tp->compressed_ack) return; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); /* Since we have to send one ack finally, * substract one from tp->compressed_ack to keep * LINUX_MIB_TCPACKCOMPRESSED accurate. */ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, tp->compressed_ack - 1); tp->compressed_ack = 0; tcp_send_ack(sk); } /* Reasonable amount of sack blocks included in TCP SACK option * The max is 4, but this becomes 3 if TCP timestamps are there. * Given that SACK packets might be lost, be conservative and use 2. */ #define TCP_SACK_BLOCKS_EXPECTED 2 static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_sack_block *sp = &tp->selective_acks[0]; int cur_sacks = tp->rx_opt.num_sacks; int this_sack; if (!cur_sacks) goto new_sack; for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { if (tcp_sack_extend(sp, seq, end_seq)) { if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) tcp_sack_compress_send_ack(sk); /* Rotate this_sack to the first one. */ for (; this_sack > 0; this_sack--, sp--) swap(*sp, *(sp - 1)); if (cur_sacks > 1) tcp_sack_maybe_coalesce(tp); return; } } if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) tcp_sack_compress_send_ack(sk); /* Could not find an adjacent existing SACK, build a new one, * put it at the front, and shift everyone else down. We * always know there is at least one SACK present already here. * * If the sack array is full, forget about the last one. */ if (this_sack >= TCP_NUM_SACKS) { this_sack--; tp->rx_opt.num_sacks--; sp--; } for (; this_sack > 0; this_sack--, sp--) *sp = *(sp - 1); new_sack: /* Build the new head SACK, and we're done. */ sp->start_seq = seq; sp->end_seq = end_seq; tp->rx_opt.num_sacks++; } /* RCV.NXT advances, some SACKs should be eaten. */ static void tcp_sack_remove(struct tcp_sock *tp) { struct tcp_sack_block *sp = &tp->selective_acks[0]; int num_sacks = tp->rx_opt.num_sacks; int this_sack; /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; return; } for (this_sack = 0; this_sack < num_sacks;) { /* Check if the start of the sack is covered by RCV.NXT. */ if (!before(tp->rcv_nxt, sp->start_seq)) { int i; /* RCV.NXT must cover all the block! */ WARN_ON(before(tp->rcv_nxt, sp->end_seq)); /* Zap this SACK, by moving forward any other SACKS. */ for (i = this_sack+1; i < num_sacks; i++) tp->selective_acks[i-1] = tp->selective_acks[i]; num_sacks--; continue; } this_sack++; sp++; } tp->rx_opt.num_sacks = num_sacks; } /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket * @to: prior buffer * @from: buffer to add in queue * @fragstolen: pointer to boolean * * Before queueing skb @from after @to, try to merge them * to reduce overall memory use and queue lengths, if cost is small. * Packets in ofo or receive queues can stay a long time. * Better try to coalesce them right now to avoid future collapses. * Returns true if caller should free @from instead of queueing it */ static bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) { int delta; *fragstolen = false; /* Its possible this segment overlaps with prior segment in queue */ if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; if (!tcp_skb_can_collapse_rx(to, from)) return false; if (!skb_try_coalesce(to, from, fragstolen, &delta)) return false; atomic_add(delta, &sk->sk_rmem_alloc); sk_mem_charge(sk, delta); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; if (TCP_SKB_CB(from)->has_rxtstamp) { TCP_SKB_CB(to)->has_rxtstamp = true; to->tstamp = from->tstamp; skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp; } return true; } static bool tcp_ooo_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) { bool res = tcp_try_coalesce(sk, to, from, fragstolen); /* In case tcp_drop_reason() is called later, update to->gso_segs */ if (res) { u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) + max_t(u16, 1, skb_shinfo(from)->gso_segs); skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF); } return res; } noinline_for_tracing static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { sk_drops_skbadd(sk, skb); sk_skb_reason_drop(sk, skb, reason); } /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; bool fin, fragstolen, eaten; struct sk_buff *skb, *tail; struct rb_node *p; p = rb_first(&tp->out_of_order_queue); while (p) { skb = rb_to_skb(p); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; if (before(TCP_SKB_CB(skb)->seq, dsack_high)) { __u32 dsack = dsack_high; if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) dsack = TCP_SKB_CB(skb)->end_seq; tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_DROP); continue; } tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) tcp_add_receive_queue(sk, skb); else kfree_skb_partial(skb, fragstolen); if (unlikely(fin)) { tcp_fin(sk); /* tcp_fin() purges tp->out_of_order_queue, * so we must end this loop right now. */ break; } } } static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb); static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb); /* Check if this incoming skb can be added to socket receive queues * while satisfying sk->sk_rcvbuf limit. * * In theory we should use skb->truesize, but this can cause problems * when applications use too small SO_RCVBUF values. * When LRO / hw gro is used, the socket might have a high tp->scaling_ratio, * allowing RWIN to be close to available space. * Whenever the receive queue gets full, we can receive a small packet * filling RWIN, but with a high skb->truesize, because most NIC use 4K page * plus sk_buff metadata even when receiving less than 1500 bytes of payload. * * Note that we use skb->len to decide to accept or drop this packet, * but sk->sk_rmem_alloc is the sum of all skb->truesize. */ static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb) { unsigned int rmem = atomic_read(&sk->sk_rmem_alloc); return rmem + skb->len <= sk->sk_rcvbuf; } static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb, unsigned int size) { if (!tcp_can_ingest(sk, skb) || !sk_rmem_schedule(sk, skb, size)) { if (tcp_prune_queue(sk, skb) < 0) return -1; while (!sk_rmem_schedule(sk, skb, size)) { if (!tcp_prune_ofo_queue(sk, skb)) return -1; } } return 0; } static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct rb_node **p, *parent; struct sk_buff *skb1; u32 seq, end_seq; bool fragstolen; tcp_save_lrcv_flowlabel(sk, skb); tcp_data_ecn_check(sk, skb); if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); sk->sk_data_ready(sk); tcp_drop_reason(sk, skb, SKB_DROP_REASON_PROTO_MEM); return; } tcp_measure_rcv_mss(sk, skb); /* Disable header prediction. */ tp->pred_flags = 0; inet_csk_schedule_ack(sk); tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); seq = TCP_SKB_CB(skb)->seq; end_seq = TCP_SKB_CB(skb)->end_seq; p = &tp->out_of_order_queue.rb_node; if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; tp->selective_acks[0].start_seq = seq; tp->selective_acks[0].end_seq = end_seq; } rb_link_node(&skb->rbnode, NULL, p); rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); tp->ooo_last_skb = skb; goto end; } /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { coalesce_done: /* For non sack flows, do not grow window to force DUPACK * and trigger fast retransmit. */ if (tcp_is_sack(tp)) tcp_grow_window(sk, skb, true); kfree_skb_partial(skb, fragstolen); skb = NULL; goto add_sack; } /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) { parent = &tp->ooo_last_skb->rbnode; p = &parent->rb_right; goto insert; } /* Find place to insert this segment. Handle overlaps on the way. */ parent = NULL; while (*p) { parent = *p; skb1 = rb_to_skb(parent); if (before(seq, TCP_SKB_CB(skb1)->seq)) { p = &parent->rb_left; continue; } if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFOMERGE); skb = NULL; tcp_dsack_set(sk, seq, end_seq); goto add_sack; } if (after(seq, TCP_SKB_CB(skb1)->seq)) { /* Partial overlap. */ tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); } else { /* skb's seq == skb1's seq and skb covers skb1. * Replace skb1 with skb. */ rb_replace_node(&skb1->rbnode, &skb->rbnode, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop_reason(sk, skb1, SKB_DROP_REASON_TCP_OFOMERGE); goto merge_right; } } else if (tcp_ooo_try_coalesce(sk, skb1, skb, &fragstolen)) { goto coalesce_done; } p = &parent->rb_right; } insert: /* Insert segment into RB tree. */ rb_link_node(&skb->rbnode, parent, p); rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); merge_right: /* Remove other segments covered by skb. */ while ((skb1 = skb_rb_next(skb)) != NULL) { if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, end_seq); break; } rb_erase(&skb1->rbnode, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop_reason(sk, skb1, SKB_DROP_REASON_TCP_OFOMERGE); } /* If there is no skb after us, we are the last_skb ! */ if (!skb1) tp->ooo_last_skb = skb; add_sack: if (tcp_is_sack(tp)) tcp_sack_new_ofo_skb(sk, seq, end_seq); end: if (skb) { /* For non sack flows, do not grow window to force DUPACK * and trigger fast retransmit. */ if (tcp_is_sack(tp)) tcp_grow_window(sk, skb, false); skb_condense(skb); skb_set_owner_r(skb, sk); } /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */ if (sk->sk_socket) tcp_rcvbuf_grow(sk, tp->rcvq_space.space); } static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, bool *fragstolen) { int eaten; struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); eaten = (tail && tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { tcp_add_receive_queue(sk, skb); skb_set_owner_r(skb, sk); } return eaten; } int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_buff *skb; int err = -ENOMEM; int data_len = 0; bool fragstolen; if (size == 0) return 0; if (size > PAGE_SIZE) { int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS); data_len = npages << PAGE_SHIFT; size = data_len + (size & ~PAGE_MASK); } skb = alloc_skb_with_frags(size - data_len, data_len, PAGE_ALLOC_COSTLY_ORDER, &err, sk->sk_allocation); if (!skb) goto err; skb_put(skb, size - data_len); skb->data_len = data_len; skb->len = size; if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto err_free; } err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); if (err) goto err_free; TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; if (tcp_queue_rcv(sk, skb, &fragstolen)) { WARN_ON_ONCE(fragstolen); /* should not happen */ __kfree_skb(skb); } return size; err_free: kfree_skb(skb); err: return err; } void tcp_data_ready(struct sock *sk) { if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE)) sk->sk_data_ready(sk); } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); enum skb_drop_reason reason; bool fragstolen; int eaten; /* If a subflow has been reset, the packet should not continue * to be processed, drop the packet. */ if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) { __kfree_skb(skb); return; } if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); return; } tcp_cleanup_skb(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); reason = SKB_DROP_REASON_NOT_SPECIFIED; tp->rx_opt.dsack = 0; /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. * Out of sequence packets to the out_of_order_queue. */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { if (tcp_receive_window(tp) == 0) { /* Some stacks are known to send bare FIN packets * in a loop even if we send RWIN 0 in our ACK. * Accepting this FIN does not hurt memory pressure * because the FIN flag will simply be merged to the * receive queue tail skb in most cases. */ if (!skb->len && (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) goto queue_and_out; reason = SKB_DROP_REASON_TCP_ZEROWINDOW; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; } /* Ok. In sequence. In window. */ queue_and_out: if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { /* TODO: maybe ratelimit these WIN 0 ACK ? */ inet_csk(sk)->icsk_ack.pending |= (ICSK_ACK_NOMEM | ICSK_ACK_NOW); inet_csk_schedule_ack(sk); sk->sk_data_ready(sk); if (skb_queue_len(&sk->sk_receive_queue) && skb->len) { reason = SKB_DROP_REASON_PROTO_MEM; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto drop; } sk_forced_mem_schedule(sk, skb->truesize); } eaten = tcp_queue_rcv(sk, skb, &fragstolen); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); /* RFC5681. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. */ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } if (tp->rx_opt.num_sacks) tcp_sack_remove(tp); tcp_fast_path_check(sk); if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) tcp_data_ready(sk); return; } if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { tcp_rcv_spurious_retrans(sk, skb); /* A retransmit, 2nd most common case. Force an immediate ack. */ reason = SKB_DROP_REASON_TCP_OLD_DATA; NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); out_of_window: tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); inet_csk_schedule_ack(sk); drop: tcp_drop_reason(sk, skb, reason); return; } /* Out of window. F.e. zero window probe. */ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) { reason = SKB_DROP_REASON_TCP_OVERWINDOW; goto out_of_window; } if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); /* If window is closed, drop tail of packet. But after * remembering D-SACK for its head made in previous line. */ if (!tcp_receive_window(tp)) { reason = SKB_DROP_REASON_TCP_ZEROWINDOW; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; } goto queue_and_out; } tcp_data_queue_ofo(sk, skb); } static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) { if (list) return !skb_queue_is_last(list, skb) ? skb->next : NULL; return skb_rb_next(skb); } static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *list, struct rb_root *root) { struct sk_buff *next = tcp_skb_next(skb, list); if (list) __skb_unlink(skb, list); else rb_erase(&skb->rbnode, root); __kfree_skb(skb); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); return next; } /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct sk_buff *skb1; while (*p) { parent = *p; skb1 = rb_to_skb(parent); if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) p = &parent->rb_left; else p = &parent->rb_right; } rb_link_node(&skb->rbnode, parent, p); rb_insert_color(&skb->rbnode, root); } /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * * If tail is NULL, this means until the end of the queue. * * Segments with FIN/SYN are not collapsed (only because this * simplifies code) */ static void tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) { struct sk_buff *skb = head, *n; struct sk_buff_head tmp; bool end_of_skbs; /* First, check that queue is collapsible and find * the point where collapsing can be useful. */ restart: for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { n = tcp_skb_next(skb, list); if (!skb_frags_readable(skb)) goto skip_this; /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { skb = tcp_collapse_one(sk, skb, list, root); if (!skb) break; goto restart; } /* The first skb to collapse is: * - not SYN/FIN and * - bloated or contains data before "start" or * overlaps to the next one and mptcp allow collapsing. */ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && (tcp_win_from_space(sk, skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start))) { end_of_skbs = false; break; } if (n && n != tail && skb_frags_readable(n) && tcp_skb_can_collapse_rx(skb, n) && TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { end_of_skbs = false; break; } skip_this: /* Decided to skip this, advance start seq. */ start = TCP_SKB_CB(skb)->end_seq; } if (end_of_skbs || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) || !skb_frags_readable(skb)) return; __skb_queue_head_init(&tmp); while (before(start, end)) { int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); struct sk_buff *nskb; nskb = alloc_skb(copy, GFP_ATOMIC); if (!nskb) break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); skb_copy_decrypted(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; if (list) __skb_queue_before(list, skb, nskb); else __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); mptcp_skb_ext_move(nskb, skb); /* Copy data, releasing collapsed skbs. */ while (copy > 0) { int offset = start - TCP_SKB_CB(skb)->seq; int size = TCP_SKB_CB(skb)->end_seq - start; BUG_ON(offset < 0); if (size > 0) { size = min(copy, size); if (skb_copy_bits(skb, offset, skb_put(nskb, size), size)) BUG(); TCP_SKB_CB(nskb)->end_seq += size; copy -= size; start += size; } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || !tcp_skb_can_collapse_rx(nskb, skb) || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) || !skb_frags_readable(skb)) goto end; } } } end: skb_queue_walk_safe(&tmp, skb, n) tcp_rbtree_insert(root, skb); } /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs * and tcp_collapse() them until all the queue is collapsed. */ static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 range_truesize, sum_tiny = 0; struct sk_buff *skb, *head; u32 start, end; skb = skb_rb_first(&tp->out_of_order_queue); new_range: if (!skb) { tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue); return; } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; range_truesize = skb->truesize; for (head = skb;;) { skb = skb_rb_next(skb); /* Range is terminated when we see a gap or when * we are at the queue end. */ if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { /* Do not attempt collapsing tiny skbs */ if (range_truesize != head->truesize || end - start >= SKB_WITH_OVERHEAD(PAGE_SIZE)) { tcp_collapse(sk, NULL, &tp->out_of_order_queue, head, skb, start, end); } else { sum_tiny += range_truesize; if (sum_tiny > sk->sk_rcvbuf >> 3) return; } goto new_range; } range_truesize += skb->truesize; if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) start = TCP_SKB_CB(skb)->seq; if (after(TCP_SKB_CB(skb)->end_seq, end)) end = TCP_SKB_CB(skb)->end_seq; } } /* * Clean the out-of-order queue to make room. * We drop high sequences packets to : * 1) Let a chance for holes to be filled. * This means we do not drop packets from ooo queue if their sequence * is before incoming packet sequence. * 2) not add too big latencies if thousands of packets sit there. * (But if application shrinks SO_RCVBUF, we could still end up * freeing whole queue here) * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks. * * Return true if queue has shrunk. */ static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb) { struct tcp_sock *tp = tcp_sk(sk); struct rb_node *node, *prev; bool pruned = false; int goal; if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) return false; goal = sk->sk_rcvbuf >> 3; node = &tp->ooo_last_skb->rbnode; do { struct sk_buff *skb = rb_to_skb(node); /* If incoming skb would land last in ofo queue, stop pruning. */ if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq)) break; pruned = true; prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); goal -= skb->truesize; tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE); tp->ooo_last_skb = rb_to_skb(prev); if (!prev || goal <= 0) { if (tcp_can_ingest(sk, in_skb) && !tcp_under_memory_pressure(sk)) break; goal = sk->sk_rcvbuf >> 3; } node = prev; } while (node); if (pruned) { NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. When a connection * is in a sad state like this, we care only about integrity * of the connection not performance. */ if (tp->rx_opt.sack_ok) tcp_sack_reset(&tp->rx_opt); } return pruned; } /* Reduce allocated memory if we can, trying to get * the socket within its memory limits again. * * Return less than zero if we should start dropping frames * until the socket owning process reads some of the data * to stabilize the situation. */ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) { struct tcp_sock *tp = tcp_sk(sk); /* Do nothing if our queues are empty. */ if (!atomic_read(&sk->sk_rmem_alloc)) return -1; NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); if (!tcp_can_ingest(sk, in_skb)) tcp_clamp_window(sk); else if (tcp_under_memory_pressure(sk)) tcp_adjust_rcv_ssthresh(sk); if (tcp_can_ingest(sk, in_skb)) return 0; tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) tcp_collapse(sk, &sk->sk_receive_queue, NULL, skb_peek(&sk->sk_receive_queue), NULL, tp->copied_seq, tp->rcv_nxt); if (tcp_can_ingest(sk, in_skb)) return 0; /* Collapsing did not help, destructive actions follow. * This must not ever occur. */ tcp_prune_ofo_queue(sk, in_skb); if (tcp_can_ingest(sk, in_skb)) return 0; /* If we are really being abused, tell the caller to silently * drop receive data on the floor. It will get retransmitted * and hopefully then we'll have sufficient space. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED); /* Massive buffer overcommit. */ tp->pred_flags = 0; return -1; } static bool tcp_should_expand_sndbuf(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* If the user specified a specific send buffer setting, do * not modify it. */ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) return false; /* If we are under global TCP memory pressure, do not expand. */ if (tcp_under_memory_pressure(sk)) { int unused_mem = sk_unused_reserved_mem(sk); /* Adjust sndbuf according to reserved mem. But make sure * it never goes below SOCK_MIN_SNDBUF. * See sk_stream_moderate_sndbuf() for more details. */ if (unused_mem > SOCK_MIN_SNDBUF) WRITE_ONCE(sk->sk_sndbuf, unused_mem); return false; } /* If we are under soft global TCP memory pressure, do not expand. */ if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) return false; /* If we filled the congestion window, do not expand. */ if (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp)) return false; return true; } static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_should_expand_sndbuf(sk)) { tcp_sndbuf_expand(sk); tp->snd_cwnd_stamp = tcp_jiffies32; } INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk); } /* Caller made space either from: * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced) * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt) * * We might be able to generate EPOLLOUT to the application if: * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2 * 2) notsent amount (tp->write_seq - tp->snd_nxt) became * small enough that tcp_stream_memory_free() decides it * is time to generate EPOLLOUT. */ void tcp_check_space(struct sock *sk) { /* pairs with tcp_poll() */ smp_mb(); if (sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { tcp_new_space(sk); if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } } static inline void tcp_data_snd_check(struct sock *sk) { tcp_push_pending_frames(sk); tcp_check_space(sk); } /* * Check if sending an ack is needed. */ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); unsigned long rtt; u64 delay; /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). * If application uses SO_RCVLOWAT, we want send ack now if * we have not received enough bytes to satisfy the condition. */ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || __tcp_select_window(sk) >= tp->rcv_wnd)) || /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) { /* If we are running from __release_sock() in user context, * Defer the ack until tcp_release_cb(). */ if (sock_owned_by_user_nocheck(sk) && READ_ONCE(net->ipv4.sysctl_tcp_backlog_ack_defer)) { set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags); return; } send_now: tcp_send_ack(sk); return; } if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_send_delayed_ack(sk); return; } if (!tcp_is_sack(tp) || tp->compressed_ack >= READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_nr)) goto send_now; if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { tp->compressed_ack_rcv_nxt = tp->rcv_nxt; tp->dup_ack_counter = 0; } if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { tp->dup_ack_counter++; goto send_now; } tp->compressed_ack++; if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; /* compress ack timer : comp_sack_rtt_percent of rtt, * but no more than tcp_comp_sack_delay_ns. */ rtt = tp->rcv_rtt_est.rtt_us; if (tp->srtt_us && tp->srtt_us < rtt) rtt = tp->srtt_us; /* delay = (rtt >> 3) * NSEC_PER_USEC * comp_sack_rtt_percent / 100 * -> * delay = rtt * 1.25 * comp_sack_rtt_percent */ delay = (u64)(rtt + (rtt >> 2)) * READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_rtt_percent); delay = min(delay, READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_delay_ns)); sock_hold(sk); hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_slack_ns), HRTIMER_MODE_REL_PINNED_SOFT); } static inline void tcp_ack_snd_check(struct sock *sk) { if (!inet_csk_ack_scheduled(sk)) { /* We sent a data segment already. */ return; } __tcp_ack_snd_check(sk, 1); } /* * This routine is only called when we have urgent data * signaled. Its the 'slow' part of tcp_urg. It could be * moved inline now as tcp_urg is only called from one * place. We handle URGent data wrong. We have to - as * BSD still doesn't use the correction from RFC961. * For 1003.1g we should support a new option TCP_STDURG to permit * either form (or just set the sysctl tcp_stdurg). */ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); u32 ptr = ntohs(th->urg_ptr); if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg)) ptr--; ptr += ntohl(th->seq); /* Ignore urgent data that we've already seen and read. */ if (after(tp->copied_seq, ptr)) return; /* Do not replay urg ptr. * * NOTE: interesting situation not covered by specs. * Misbehaving sender may send urg ptr, pointing to segment, * which we already have in ofo queue. We are not able to fetch * such data and will stay in TCP_URG_NOTYET until will be eaten * by recvmsg(). Seems, we are not obliged to handle such wicked * situations. But it is worth to think about possibility of some * DoSes using some hypothetical application level deadlock. */ if (before(ptr, tp->rcv_nxt)) return; /* Do we already have a newer (or duplicate) urgent pointer? */ if (tp->urg_data && !after(ptr, tp->urg_seq)) return; /* Tell the world about our new urgent pointer. */ sk_send_sigurg(sk); /* We may be adding urgent data when the last byte read was * urgent. To do this requires some care. We cannot just ignore * tp->copied_seq since we would read the last urgent byte again * as data, nor can we alter copied_seq until this data arrives * or we break the semantics of SIOCATMARK (and thus sockatmark()) * * NOTE. Double Dutch. Rendering to plain English: author of comment * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); * and expect that both A and B disappear from stream. This is _wrong_. * Though this happens in BSD with high probability, this is occasional. * Any application relying on this is buggy. Note also, that fix "works" * only in this artificial test. Insert some normal data between A and B and we will * decline of BSD again. Verdict: it is better to remove to trap * buggy users. */ if (tp->urg_seq == tp->copied_seq && tp->urg_data && !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); tp->copied_seq++; if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } } WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET); WRITE_ONCE(tp->urg_seq, ptr); /* Disable header prediction. */ tp->pred_flags = 0; } /* This is the 'fast' part of urgent handling. */ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); /* Check if we get a new urgent pointer - normally not. */ if (unlikely(th->urg)) tcp_check_urg(sk, th); /* Do we wait for any urgent data? - normally not... */ if (unlikely(tp->urg_data == TCP_URG_NOTYET)) { u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) - th->syn; /* Is the urgent pointer pointing into this packet? */ if (ptr < skb->len) { u8 tmp; if (skb_copy_bits(skb, ptr, &tmp, 1)) BUG(); WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); } } } /* Accept RST for rcv_nxt - 1 after a FIN. * When tcp connections are abruptly terminated from Mac OSX (via ^C), a * FIN is sent followed by a RST packet. The RST is sent with the same * sequence number as the FIN, and thus according to RFC 5961 a challenge * ACK should be sent. However, Mac OSX rate limits replies to challenge * ACKs on the closed socket. In addition middleboxes can drop either the * challenge ACK or a subsequent RST. */ static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) && (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSING)); } /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { struct tcp_sock *tp = tcp_sk(sk); bool accecn_reflector = false; SKB_DR(reason); /* RFC1323: H1. Apply PAWS check first. */ if (!tcp_fast_parse_options(sock_net(sk), skb, th, tp) || !tp->rx_opt.saw_tstamp || tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW)) goto step1; reason = tcp_disordered_ack_check(sk, skb); if (!reason) goto step1; /* Reset is accepted even if it did not pass PAWS. */ if (th->rst) goto step1; if (unlikely(th->syn)) goto syn_challenge; /* Old ACK are common, increment PAWS_OLD_ACK * and do not send a dupack. */ if (reason == SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWS_OLD_ACK); goto discard; } NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDPAWS, &tp->last_oow_ack_time)) tcp_send_dupack(sk, skb); goto discard; step1: /* Step 1: check sequence number */ reason = tcp_sequence(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); if (reason) { /* RFC793, page 37: "In all states except SYN-SENT, all reset * (RST) segments are validated by checking their SEQ-fields." * And page 69: "If an incoming segment is not acceptable, * an acknowledgment should be sent in reply (unless the RST * bit is set, if so drop the segment and return)". */ if (!th->rst) { if (th->syn) goto syn_challenge; if (reason == SKB_DROP_REASON_TCP_INVALID_SEQUENCE || reason == SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE) NET_INC_STATS(sock_net(sk), LINUX_MIB_BEYOND_WINDOW); if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSEQ, &tp->last_oow_ack_time)) tcp_send_dupack(sk, skb); } else if (tcp_reset_check(sk, skb)) { goto reset; } goto discard; } /* Step 2: check RST bit */ if (th->rst) { /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a * FIN and SACK too if available): * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or * the right-most SACK block, * then * RESET the connection * else * Send a challenge ACK */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt || tcp_reset_check(sk, skb)) goto reset; if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { struct tcp_sack_block *sp = &tp->selective_acks[0]; int max_sack = sp[0].end_seq; int this_sack; for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; ++this_sack) { max_sack = after(sp[this_sack].end_seq, max_sack) ? sp[this_sack].end_seq : max_sack; } if (TCP_SKB_CB(skb)->seq == max_sack) goto reset; } /* Disable TFO if RST is out-of-order * and no data has been received * for current active TFO socket */ if (tp->syn_fastopen && !tp->data_segs_in && sk->sk_state == TCP_ESTABLISHED) tcp_fastopen_active_disable(sk); tcp_send_challenge_ack(sk, false); SKB_DR_SET(reason, TCP_RESET); goto discard; } /* step 3: check security and precedence [ignored] */ /* step 4: Check for a SYN * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; if (tp->rx_opt.accecn && tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); tcp_accecn_saw_opt_fail_recv(tp, saw_opt); tcp_accecn_opt_demand_min(sk, 1); } } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && TCP_SKB_CB(skb)->ack_seq == tp->snd_nxt) goto pass; syn_challenge: if (syn_inerr) TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); tcp_send_challenge_ack(sk, accecn_reflector); SKB_DR_SET(reason, TCP_INVALID_SYN); goto discard; } pass: bpf_skops_parse_hdr(sk, skb); return true; discard: tcp_drop_reason(sk, skb, reason); return false; reset: tcp_reset(sk, skb); __kfree_skb(skb); return false; } /* * TCP receive function for the ESTABLISHED state. * * It is split into a fast path and a slow path. The fast path is * disabled when: * - A zero window was announced from us - zero window probing * is only handled properly in the slow path. * - Out of order segments arrived. * - Urgent data is expected. * - There is no buffer space left * - Unexpected TCP flags/window values/header lengths are received * (detected by checking the TCP header against pred_flags) * - Data is sent in both directions. Fast path only supports pure senders * or pure receivers (this means either the sequence number or the ack * value must stay constant) * - Unexpected TCP option. * * When these conditions are not satisfied it drops into a standard * receive procedure patterned after RFC793 to handle all cases. * The first three cases are guaranteed by proper pred_flags setting, * the rest is checked inline. Fast processing is turned on in * tcp_data_queue when everything is OK. */ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; const struct tcphdr *th = (const struct tcphdr *)skb->data; struct tcp_sock *tp = tcp_sk(sk); unsigned int len = skb->len; /* TCP congestion window tracking */ trace_tcp_probe(sk, skb); tcp_mstamp_refresh(tp); if (unlikely(!rcu_access_pointer(sk->sk_rx_dst))) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* * Header prediction. * The code loosely follows the one in the famous * "30 instruction TCP receive" Van Jacobson mail. * * Van's trick is to deposit buffers into socket queue * on a device interrupt, to call tcp_recv function * on the receive process context and checksum and copy * the buffer to user space. smart... * * Our current scheme is not silly either but we take the * extra cost of the net_bh soft interrupt processing... * We do checksum and copy also but from device to kernel. */ tp->rx_opt.saw_tstamp = 0; tp->rx_opt.accecn = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made * 'S' will always be tp->tcp_header_len >> 2 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to * turn it off (when there are holes in the receive * space for instance) * PSH flag is ignored. */ if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt && !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { int tcp_header_len = tp->tcp_header_len; s32 delta = 0; int flag = 0; /* Timestamp header prediction: tcp_header_len * is automatically equal to th->doff*4 due to pred_flags * match. */ /* Check timestamp */ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { /* No? Slow path! */ if (!tcp_parse_aligned_timestamp(tp, th)) goto slow_path; delta = tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent; /* If PAWS failed, check it more carefully in slow path */ if (delta < 0) goto slow_path; /* DO NOT update ts_recent here, if checksum fails * and timestamp was corrupted part, it will result * in a hung connection since we will drop all * future packets due to the PAWS test. */ } if (len <= tcp_header_len) { /* Bulk data transfer: sender */ if (len == tcp_header_len) { /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: */ if (tcp_header_len == (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && tp->rcv_nxt == tp->rcv_wup) flag |= __tcp_replace_ts_recent(tp, delta); tcp_ecn_received_counters(sk, skb, 0); /* We know that such packets are checksummed * on entry. */ tcp_ack(sk, skb, flag); __kfree_skb(skb); tcp_data_snd_check(sk); /* When receiving pure ack in fast path, update * last ts ecr directly instead of calling * tcp_rcv_rtt_measure_ts() */ tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; return; } else { /* Header too small */ reason = SKB_DROP_REASON_PKT_TOO_SMALL; TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } } else { int eaten = 0; bool fragstolen = false; if (tcp_checksum_complete(skb)) goto csum_error; if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt + tcp_receive_window(tp))) goto validate; if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: */ if (tcp_header_len == (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && tp->rcv_nxt == tp->rcv_wup) flag |= __tcp_replace_ts_recent(tp, delta); tcp_rcv_rtt_measure_ts(sk, skb); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); tcp_ecn_received_counters(sk, skb, len - tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { /* Well, only one small jumplet in fast path... */ tcp_ack(sk, skb, flag | FLAG_DATA); tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; } else { tcp_update_wl(tp, TCP_SKB_CB(skb)->seq); } __tcp_ack_snd_check(sk, 0); no_ack: if (eaten) kfree_skb_partial(skb, fragstolen); tcp_data_ready(sk); return; } } slow_path: if (len < (th->doff << 2) || tcp_checksum_complete(skb)) goto csum_error; if (!th->ack && !th->rst && !th->syn) { reason = SKB_DROP_REASON_TCP_FLAGS; goto discard; } /* * Standard slow path. */ validate: if (!tcp_validate_incoming(sk, skb, th, 1)) return; step5: tcp_ecn_received_counters_payload(sk, skb); reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { reason = -reason; goto discard; } tcp_rcv_rtt_measure_ts(sk, skb); /* Process urgent data. */ tcp_urg(sk, skb, th); /* step 7: process the segment text */ tcp_data_queue(sk, skb); tcp_data_snd_check(sk); tcp_ack_snd_check(sk); return; csum_error: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); discard: tcp_drop_reason(sk, skb, reason); } EXPORT_IPV6_MOD(tcp_rcv_established); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); /* Initialize the congestion window to start the transfer. * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been * retransmitted. In light of RFC6298 more aggressive 1sec * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK * retransmission has occurred. */ if (tp->total_retrans > 1 && tp->undo_marker) tcp_snd_cwnd_set(tp, 1); else tcp_snd_cwnd_set(tp, tcp_init_cwnd(tp, __sk_dst_get(sk))); tp->snd_cwnd_stamp = tcp_jiffies32; bpf_skops_established(sk, bpf_op, skb); /* Initialize congestion control unless BPF initialized it already: */ if (!icsk->icsk_ca_initialized) tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); tcp_ao_finish_connect(sk, skb); tcp_set_state(sk, TCP_ESTABLISHED); icsk->icsk_ack.lrcvtime = tcp_jiffies32; if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); sk_mark_napi_id(sk, skb); } tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb); /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ tp->lsndtime = tcp_jiffies32; if (sock_flag(sk, SOCK_KEEPOPEN)) tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); if (!tp->rx_opt.snd_wscale) __tcp_fast_path_on(tp, tp->snd_wnd); else tp->pred_flags = 0; } static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_fastopen_cookie *cookie) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL; u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; bool syn_drop = false; if (mss == READ_ONCE(tp->rx_opt.user_mss)) { struct tcp_options_received opt; /* Get original SYNACK MSS value if user MSS sets mss_clamp */ tcp_clear_options(&opt); opt.user_mss = opt.mss_clamp = 0; tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL); mss = opt.mss_clamp; } if (!tp->syn_fastopen) { /* Ignore an unsolicited cookie */ cookie->len = -1; } else if (tp->total_retrans) { /* SYN timed out and the SYN-ACK neither has a cookie nor * acknowledges data. Presumably the remote received only * the retransmitted (regular) SYNs: either the original * SYN-data or the corresponding SYN-ACK was dropped. */ syn_drop = (cookie->len < 0 && data); } else if (cookie->len < 0 && !tp->syn_data) { /* We requested a cookie but didn't get it. If we did not use * the (old) exp opt format then try so next time (try_exp=1). * Otherwise we go back to use the RFC7413 opt (try_exp=2). */ try_exp = tp->syn_fastopen_exp ? 2 : 1; } tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ if (tp->total_retrans) tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; else tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; skb_rbtree_walk_from(data) tcp_mark_skb_lost(sk, data); tcp_non_congestion_loss_retransmit(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; } tp->syn_data_acked = tp->syn_data; if (tp->syn_data_acked) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); /* SYN-data is counted as two separate packets in tcp_ack() */ if (tp->delivered > 1) --tp->delivered; } tcp_fastopen_add_skb(sk, synack); return false; } static void smc_check_reset_syn(struct tcp_sock *tp) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc && !tp->rx_opt.smc_ok) tp->syn_smc = 0; } #endif } static void tcp_try_undo_spurious_syn(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 syn_stamp; /* undo_marker is set when SYN or SYNACK times out. The timeout is * spurious if the ACK's timestamp option echo value matches the * original SYN timestamp. */ syn_stamp = tp->retrans_stamp; if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp && syn_stamp == tp->rx_opt.rcv_tsecr) tp->undo_marker = 0; } static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_cookie foc = { .len = -1 }; int saved_clamp = tp->rx_opt.mss_clamp; bool fastopen_fail; SKB_DR(reason); tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc); if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; if (th->ack) { /* rfc793: * "If the state is SYN-SENT then * first check the ACK bit * If the ACK bit is set * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send * a reset (unless the RST bit is set, if so drop * the segment and return)" */ if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { /* Previous FIN/ACK or RST/ACK might be ignored. */ if (icsk->icsk_retransmits == 0) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, TCP_TIMEOUT_MIN, false); SKB_DR_SET(reason, TCP_INVALID_ACK_SEQUENCE); goto reset_and_undo; } if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, tcp_time_stamp_ts(tp))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); SKB_DR_SET(reason, TCP_RFC7323_PAWS); goto reset_and_undo; } /* Now ACK is acceptable. * * "If the RST bit is set * If the ACK was acceptable then signal the user "error: * connection reset", drop the segment, enter CLOSED state, * delete TCB, and return." */ if (th->rst) { tcp_reset(sk, skb); consume: __kfree_skb(skb); return 0; } /* rfc793: * "fifth, if neither of the SYN or RST bits is set then * drop the segment and return." * * See note below! * --ANK(990513) */ if (!th->syn) { SKB_DR_SET(reason, TCP_FLAGS); goto discard_and_undo; } /* rfc793: * "If the SYN bit is on ... * are acceptable then ... * (our SYN has been ACKed), change the connection * state to ESTABLISHED..." */ if (tcp_ecn_mode_any(tp)) tcp_ecn_rcv_synack(sk, skb, th, TCP_SKB_CB(skb)->ip_dsfield); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); tcp_ack(sk, skb, FLAG_SLOWPATH); /* Ok.. it's good. Set up sequence numbers and * move to established. */ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. */ tp->snd_wnd = ntohs(th->window); if (!tp->rx_opt.wscale_ok) { tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; WRITE_ONCE(tp->window_clamp, min(tp->window_clamp, 65535U)); } if (tp->rx_opt.saw_tstamp) { tp->rx_opt.tstamp_ok = 1; tp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; tcp_store_ts_recent(tp); } else { tp->tcp_header_len = sizeof(struct tcphdr); } tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); /* Remember, tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); smc_check_reset_syn(tp); smp_mb(); tcp_finish_connect(sk, skb); fastopen_fail = (tp->syn_fastopen || tp->syn_data) && tcp_rcv_fastopen_synack(sk, skb, &foc); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); } if (fastopen_fail) return -1; if (sk->sk_write_pending || READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) || inet_csk_in_pingpong_mode(sk)) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * * It may be deleted, but with this feature tcpdumps * look so _wonderfully_ clever, that I was not able * to stand against the temptation 8) --ANK */ inet_csk_schedule_ack(sk); tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); tcp_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, false); goto consume; } tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp)); return -1; } /* No ACK in the segment */ if (th->rst) { /* rfc793: * "If the RST bit is set * * Otherwise (no ACK) drop the segment and return." */ SKB_DR_SET(reason, TCP_RESET); goto discard_and_undo; } /* PAWS check. */ if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && tcp_paws_reject(&tp->rx_opt, 0)) { SKB_DR_SET(reason, TCP_RFC7323_PAWS); goto discard_and_undo; } if (th->syn) { /* We see SYN without ACK. It is attempt of * simultaneous connect with crossed SYNs. * Particularly, it can be connect to self. */ #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao; ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held(sk)); if (ao) { WRITE_ONCE(ao->risn, th->seq); ao->rcv_sne = 0; } #endif tcp_set_state(sk, TCP_SYN_RECV); if (tp->rx_opt.saw_tstamp) { tp->rx_opt.tstamp_ok = 1; tcp_store_ts_recent(tp); tp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { tp->tcp_header_len = sizeof(struct tcphdr); } WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. */ tp->snd_wnd = ntohs(th->window); tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; tcp_ecn_rcv_syn(tp, th, skb); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); tcp_send_synack(sk); #if 0 /* Note, we could accept data and URG from this segment. * There are no obstacles to make this (except that we must * either change tcp_recvmsg() to prevent it from returning data * before 3WHS completes per RFC793, or employ TCP Fast Open). * * However, if we ignore data in ACKless segments sometimes, * we have no reasons to accept it sometimes. * Also, seems the code doing it in step6 of tcp_rcv_state_process * is not flawless. So, discard packet for sanity. * Uncomment this return to process the data. */ return -1; #else goto consume; #endif } /* "fifth, if neither of the SYN or RST bits is set then * drop the segment and return." */ discard_and_undo: tcp_clear_options(&tp->rx_opt); tp->rx_opt.mss_clamp = saved_clamp; tcp_drop_reason(sk, skb, reason); return 0; reset_and_undo: tcp_clear_options(&tp->rx_opt); tp->rx_opt.mss_clamp = saved_clamp; /* we can reuse/return @reason to its caller to handle the exception */ return reason; } static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct request_sock *req; /* If we are still handling the SYNACK RTO, see if timestamp ECR allows * undo. If peer SACKs triggered fast recovery, we can't undo here. */ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss && !tp->packets_out) tcp_try_undo_recovery(sk); tcp_update_rto_time(tp); WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0); /* In tcp_fastopen_synack_timer() on the first SYNACK RTO we set * retrans_stamp but don't enter CA_Loss, so in case that happened we * need to zero retrans_stamp here to prevent spurious * retransmits_timed_out(). However, if the ACK of our SYNACK caused us * to enter CA_Recovery then we need to leave retrans_stamp as it was * set entering CA_Recovery, for correct retransmits_timed_out() and * undo behavior. */ tcp_retrans_stamp_cleanup(sk); /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, * we no longer need req so release it. */ req = rcu_dereference_protected(tp->fastopen_rsk, lockdep_sock_is_held(sk)); reqsk_fastopen_remove(sk, req, false); /* Re-arm the timer because data may have been sent out. * This is similar to the regular data transmission case * when new data has just been ack'ed. * * (TFO) - we could try to be more aggressive and * retransmitting any data sooner based on when they * are sent out. */ tcp_rearm_rto(sk); } /* * This function implements the receiving procedure of RFC 793 for * all states except ESTABLISHED and TIME_WAIT. * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be * address independent. */ enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct tcphdr *th = tcp_hdr(skb); struct request_sock *req; int queued = 0; SKB_DR(reason); switch (sk->sk_state) { case TCP_CLOSE: SKB_DR_SET(reason, TCP_CLOSE); goto discard; case TCP_LISTEN: if (th->ack) return SKB_DROP_REASON_TCP_FLAGS; if (th->rst) { SKB_DR_SET(reason, TCP_RESET); goto discard; } if (th->syn) { if (th->fin) { SKB_DR_SET(reason, TCP_FLAGS); goto discard; } /* It is possible that we process SYN packets from backlog, * so we need to make sure to disable BH and RCU right there. */ rcu_read_lock(); local_bh_disable(); icsk->icsk_af_ops->conn_request(sk, skb); local_bh_enable(); rcu_read_unlock(); consume_skb(skb); return 0; } SKB_DR_SET(reason, TCP_FLAGS); goto discard; case TCP_SYN_SENT: tp->rx_opt.saw_tstamp = 0; tcp_mstamp_refresh(tp); queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; /* Do step6 onward by hand. */ tcp_urg(sk, skb, th); __kfree_skb(skb); tcp_data_snd_check(sk); return 0; } tcp_mstamp_refresh(tp); tp->rx_opt.saw_tstamp = 0; req = rcu_dereference_protected(tp->fastopen_rsk, lockdep_sock_is_held(sk)); if (req) { bool req_stolen; WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); SKB_DR_SET(reason, TCP_FASTOPEN); if (!tcp_check_req(sk, skb, req, true, &req_stolen, &reason)) goto discard; } if (!th->ack && !th->rst && !th->syn) { SKB_DR_SET(reason, TCP_FLAGS); goto discard; } if (!tcp_validate_incoming(sk, skb, th, 0)) return 0; /* step 5: check the ACK field */ reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT | FLAG_NO_CHALLENGE_ACK); if ((int)reason <= 0) { if (sk->sk_state == TCP_SYN_RECV) { /* send one RST */ if (!reason) return SKB_DROP_REASON_TCP_OLD_ACK; return -reason; } /* accept old ack during closing */ if ((int)reason < 0) { tcp_send_challenge_ack(sk, false); reason = -reason; goto discard; } } SKB_DR_SET(reason, NOT_SPECIFIED); switch (sk->sk_state) { case TCP_SYN_RECV: tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */ if (!tp->srtt_us) tcp_synack_rtt_meas(sk, req); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; if (req) { tcp_rcv_synrecv_state_fastopen(sk); } else { tcp_try_undo_spurious_syn(sk); tp->retrans_stamp = 0; tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); } tcp_ao_established(sk); smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); sk->sk_state_change(sk); /* Note, that this wakeup is only for marginal crossed SYN case. * Passively open sockets are not waked up, because * sk->sk_sleep == NULL and sk->sk_socket == NULL. */ if (sk->sk_socket) sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); /* Prevent spurious tcp_cwnd_restart() on first data packet */ tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); if (tcp_ecn_mode_accecn(tp)) tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); tcp_fast_path_on(tp); if (sk->sk_shutdown & SEND_SHUTDOWN) tcp_shutdown(sk, SEND_SHUTDOWN); break; case TCP_FIN_WAIT1: { int tmo; if (req) tcp_rcv_synrecv_state_fastopen(sk); if (tp->snd_una != tp->write_seq) break; tcp_set_state(sk, TCP_FIN_WAIT2); WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN); sk_dst_confirm(sk); if (!sock_flag(sk, SOCK_DEAD)) { /* Wake up lingering close() */ sk->sk_state_change(sk); break; } if (READ_ONCE(tp->linger2) < 0) { tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return SKB_DROP_REASON_TCP_ABORT_ON_DATA; } if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { /* Receive out of order FIN after close() */ if (tp->syn_fastopen && th->fin) tcp_fastopen_active_disable(sk); tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return SKB_DROP_REASON_TCP_ABORT_ON_DATA; } tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else if (th->fin || sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing * and not so rare event. We still can lose it now, * if it spins in bh_lock_sock(), but it is really * marginal case. */ tcp_reset_keepalive_timer(sk, tmo); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto consume; } break; } case TCP_CLOSING: if (tp->snd_una == tp->write_seq) { tcp_time_wait(sk, TCP_TIME_WAIT, 0); goto consume; } break; case TCP_LAST_ACK: if (tp->snd_una == tp->write_seq) { tcp_update_metrics(sk); tcp_done(sk); goto consume; } break; } /* step 6: check the URG bit */ tcp_urg(sk, skb, th); /* step 7: process the segment text */ switch (sk->sk_state) { case TCP_CLOSE_WAIT: case TCP_CLOSING: case TCP_LAST_ACK: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* If a subflow has been reset, the packet should not * continue to be processed, drop the packet. */ if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) goto discard; break; } fallthrough; case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these states, * RFC 1122 says we MUST send a reset. * BSD 4.4 also does reset. */ if (sk->sk_shutdown & RCV_SHUTDOWN) { if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); tcp_reset(sk, skb); return SKB_DROP_REASON_TCP_ABORT_ON_DATA; } } fallthrough; case TCP_ESTABLISHED: tcp_data_queue(sk, skb); queued = 1; break; } /* tcp_data could move socket to TIME-WAIT */ if (sk->sk_state != TCP_CLOSE) { tcp_data_snd_check(sk); tcp_ack_snd_check(sk); } if (!queued) { discard: tcp_drop_reason(sk, skb, reason); } return 0; consume: __kfree_skb(skb); return 0; } EXPORT_IPV6_MOD(tcp_rcv_state_process); static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) { struct inet_request_sock *ireq = inet_rsk(req); if (family == AF_INET) net_dbg_ratelimited("drop open request from %pI4/%u\n", &ireq->ir_rmt_addr, port); #if IS_ENABLED(CONFIG_IPV6) else if (family == AF_INET6) net_dbg_ratelimited("drop open request from %pI6/%u\n", &ireq->ir_v6_rmt_addr, port); #endif } /* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set * * If we receive a SYN packet with these bits set, it means a * network is playing bad games with TOS bits. In order to * avoid possible false congestion notifications, we disable * TCP ECN negotiation. * * Exception: tcp_ca wants ECN. This is required for DCTCP * congestion control: Linux DCTCP asserts ECT on all packets, * including SYN, which is most optimal solution; however, * others, such as FreeBSD do not. * * Exception: At least one of the reserved bits of the TCP header (th->res1) is * set, indicating the use of a future TCP extension (such as AccECN). See * RFC8311 §4.3 which updates RFC3168 to allow the development of such * extensions. */ static void tcp_ecn_create_request(struct request_sock *req, const struct sk_buff *skb, const struct sock *listen_sk, const struct dst_entry *dst) { const struct tcphdr *th = tcp_hdr(skb); const struct net *net = sock_net(listen_sk); bool th_ecn = th->ece && th->cwr; bool ect, ecn_ok; u32 ecn_ok_dst; if (tcp_accecn_syn_requested(th) && READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) { inet_rsk(req)->ecn_ok = 1; tcp_rsk(req)->accecn_ok = 1; tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; return; } if (!th_ecn) return; ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst; if (((!ect || th->res1 || th->ae) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || (ecn_ok_dst & DST_FEATURE_ECN_CA) || tcp_bpf_ca_needs_ecn((struct sock *)req)) inet_rsk(req)->ecn_ok = 1; } static void tcp_openreq_init(struct request_sock *req, const struct tcp_options_received *rx_opt, struct sk_buff *skb, const struct sock *sk) { struct inet_request_sock *ireq = inet_rsk(req); req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; tcp_rsk(req)->accecn_ok = 0; tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; tcp_rsk(req)->accecn_fail_mode = 0; tcp_rsk(req)->syn_ect_rcv = 0; tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; ireq->tstamp_ok = rx_opt->tstamp_ok; ireq->sack_ok = rx_opt->sack_ok; ireq->snd_wscale = rx_opt->snd_wscale; ireq->wscale_ok = rx_opt->wscale_ok; ireq->acked = 0; ireq->ecn_ok = 0; ireq->ir_rmt_port = tcp_hdr(skb)->source; ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); #if IS_ENABLED(CONFIG_SMC) ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested && tcp_sk(sk)->smc_hs_congested(sk)); #endif } /* * Return true if a syncookie should be sent */ static bool tcp_syn_flood_action(struct sock *sk, const char *proto) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; struct net *net = sock_net(sk); bool want_cookie = false; u8 syncookies; syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); #ifdef CONFIG_SYN_COOKIES if (syncookies) { msg = "Sending cookies"; want_cookie = true; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); } else #endif __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); if (syncookies != 2 && !READ_ONCE(queue->synflood_warned)) { WRITE_ONCE(queue->synflood_warned, 1); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) { net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n", proto, inet6_rcv_saddr(sk), sk->sk_num, msg); } else { net_info_ratelimited("%s: Possible SYN flooding on port %pI4:%u. %s.\n", proto, &sk->sk_rcv_saddr, sk->sk_num, msg); } } return want_cookie; } static void tcp_reqsk_record_syn(const struct sock *sk, struct request_sock *req, const struct sk_buff *skb) { if (tcp_sk(sk)->save_syn) { u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); struct saved_syn *saved_syn; u32 mac_hdrlen; void *base; if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */ base = skb_mac_header(skb); mac_hdrlen = skb_mac_header_len(skb); len += mac_hdrlen; } else { base = skb_network_header(skb); mac_hdrlen = 0; } saved_syn = kmalloc(struct_size(saved_syn, data, len), GFP_ATOMIC); if (saved_syn) { saved_syn->mac_hdrlen = mac_hdrlen; saved_syn->network_hdrlen = skb_network_header_len(skb); saved_syn->tcp_hdrlen = tcp_hdrlen(skb); memcpy(saved_syn->data, base, len); req->saved_syn = saved_syn; } } } /* If a SYN cookie is required and supported, returns a clamped MSS value to be * used for SYN cookie generation. */ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); u16 mss; if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 && !inet_csk_reqsk_queue_is_full(sk)) return 0; if (!tcp_syn_flood_action(sk, rsk_ops->slab_name)) return 0; if (sk_acceptq_is_full(sk)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); return 0; } mss = tcp_parse_mss_option(th, READ_ONCE(tp->rx_opt.user_mss)); if (!mss) mss = af_ops->mss_clamp; return mss; } EXPORT_IPV6_MOD_GPL(tcp_get_syncookie_mss); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) { struct tcp_fastopen_cookie foc = { .len = -1 }; struct tcp_options_received tmp_opt; const struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; struct request_sock *req; bool want_cookie = false; struct dst_entry *dst; struct flowi fl; u8 syncookies; u32 isn; #ifdef CONFIG_TCP_AO const struct tcp_ao_hdr *aoh; #endif isn = __this_cpu_read(tcp_tw_isn); if (isn) { /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is * evidently real one. */ __this_cpu_write(tcp_tw_isn, 0); } else { syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) { want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); if (!want_cookie) goto drop; } } if (sk_acceptq_is_full(sk)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); goto drop; } req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie); if (!req) goto drop; req->syncookie = want_cookie; tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; tcp_rsk(req)->req_usec_ts = false; #if IS_ENABLED(CONFIG_MPTCP) tcp_rsk(req)->is_mptcp = 0; #endif tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; tmp_opt.user_mss = READ_ONCE(tp->rx_opt.user_mss); tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, want_cookie ? NULL : &foc); if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); if (IS_ENABLED(CONFIG_SMC) && want_cookie) tmp_opt.smc_ok = 0; tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb, sk); inet_rsk(req)->no_srccheck = inet_test_bit(TRANSPARENT, sk); /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); dst = af_ops->route_req(sk, skb, &fl, req, isn); if (!dst) goto drop_and_free; if (tmp_opt.tstamp_ok) { tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst); tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); } if (!want_cookie && !isn) { int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); /* Kill the following clause, if you dislike this way. */ if (!syncookies && (max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. * It means that we continue to communicate * to destinations, already remembered * to the moment of synflood. */ pr_drop_req(req, ntohs(tcp_hdr(skb)->source), rsk_ops->family); goto drop_and_release; } isn = af_ops->init_seq(skb); } tcp_ecn_create_request(req, skb, sk, dst); if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); if (!tmp_opt.tstamp_ok) inet_rsk(req)->ecn_ok = 0; } #ifdef CONFIG_TCP_AO if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) goto drop_and_release; /* Invalid TCP options */ if (aoh) { tcp_rsk(req)->used_tcp_ao = true; tcp_rsk(req)->ao_rcv_next = aoh->keyid; tcp_rsk(req)->ao_keyid = aoh->rnext_keyid; } else { tcp_rsk(req)->used_tcp_ao = false; } #endif tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; tcp_openreq_init_rwin(req, sk, dst); sk_rx_queue_set(req_to_sk(req), skb); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, &foc, TCP_SYNACK_FASTOPEN, skb); /* Add the child socket directly into the accept queue */ if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { bh_unlock_sock(fastopen_sk); sock_put(fastopen_sk); goto drop_and_free; } sk->sk_data_ready(sk); bh_unlock_sock(fastopen_sk); sock_put(fastopen_sk); } else { tcp_rsk(req)->tfo_listener = false; if (!want_cookie && unlikely(!inet_csk_reqsk_queue_hash_add(sk, req))) { reqsk_free(req); dst_release(dst); return 0; } af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie ? TCP_SYNACK_NORMAL : TCP_SYNACK_COOKIE, skb); if (want_cookie) { reqsk_free(req); return 0; } } reqsk_put(req); return 0; drop_and_release: dst_release(dst); drop_and_free: __reqsk_free(req); drop: tcp_listendrop(sk); return 0; } EXPORT_IPV6_MOD(tcp_conn_request);
7 7 7 7 7 7 7 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/fs.h> #include <linux/sort.h> #include <linux/slab.h> #include <linux/iversion.h> #include "super.h" #include "mds_client.h" #include <linux/ceph/decode.h> /* unused map expires after 5 minutes */ #define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ) /* * Snapshots in ceph are driven in large part by cooperation from the * client. In contrast to local file systems or file servers that * implement snapshots at a single point in the system, ceph's * distributed access to storage requires clients to help decide * whether a write logically occurs before or after a recently created * snapshot. * * This provides a perfect instantanous client-wide snapshot. Between * clients, however, snapshots may appear to be applied at slightly * different points in time, depending on delays in delivering the * snapshot notification. * * Snapshots are _not_ file system-wide. Instead, each snapshot * applies to the subdirectory nested beneath some directory. This * effectively divides the hierarchy into multiple "realms," where all * of the files contained by each realm share the same set of * snapshots. An individual realm's snap set contains snapshots * explicitly created on that realm, as well as any snaps in its * parent's snap set _after_ the point at which the parent became it's * parent (due to, say, a rename). Similarly, snaps from prior parents * during the time intervals during which they were the parent are included. * * The client is spared most of this detail, fortunately... it must only * maintains a hierarchy of realms reflecting the current parent/child * realm relationship, and for each realm has an explicit list of snaps * inherited from prior parents. * * A snap_realm struct is maintained for realms containing every inode * with an open cap in the system. (The needed snap realm information is * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq' * version number is used to ensure that as realm parameters change (new * snapshot, new parent, etc.) the client's realm hierarchy is updated. * * The realm hierarchy drives the generation of a 'snap context' for each * realm, which simply lists the resulting set of snaps for the realm. This * is attached to any writes sent to OSDs. */ /* * Unfortunately error handling is a bit mixed here. If we get a snap * update, but don't have enough memory to update our realm hierarchy, * it's not clear what we can do about it (besides complaining to the * console). */ /* * increase ref count for the realm * * caller must hold snap_rwsem. */ void ceph_get_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { lockdep_assert_held(&mdsc->snap_rwsem); /* * The 0->1 and 1->0 transitions must take the snap_empty_lock * atomically with the refcount change. Go ahead and bump the * nref here, unless it's 0, in which case we take the spinlock * and then do the increment and remove it from the list. */ if (atomic_inc_not_zero(&realm->nref)) return; spin_lock(&mdsc->snap_empty_lock); if (atomic_inc_return(&realm->nref) == 1) list_del_init(&realm->empty_item); spin_unlock(&mdsc->snap_empty_lock); } static void __insert_snap_realm(struct rb_root *root, struct ceph_snap_realm *new) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct ceph_snap_realm *r = NULL; while (*p) { parent = *p; r = rb_entry(parent, struct ceph_snap_realm, node); if (new->ino < r->ino) p = &(*p)->rb_left; else if (new->ino > r->ino) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->node, parent, p); rb_insert_color(&new->node, root); } /* * create and get the realm rooted at @ino and bump its ref count. * * caller must hold snap_rwsem for write. */ static struct ceph_snap_realm *ceph_create_snap_realm( struct ceph_mds_client *mdsc, u64 ino) { struct ceph_snap_realm *realm; lockdep_assert_held_write(&mdsc->snap_rwsem); realm = kzalloc(sizeof(*realm), GFP_NOFS); if (!realm) return ERR_PTR(-ENOMEM); /* Do not release the global dummy snaprealm until unmouting */ if (ino == CEPH_INO_GLOBAL_SNAPREALM) atomic_set(&realm->nref, 2); else atomic_set(&realm->nref, 1); realm->ino = ino; INIT_LIST_HEAD(&realm->children); INIT_LIST_HEAD(&realm->child_item); INIT_LIST_HEAD(&realm->empty_item); INIT_LIST_HEAD(&realm->dirty_item); INIT_LIST_HEAD(&realm->rebuild_item); INIT_LIST_HEAD(&realm->inodes_with_caps); spin_lock_init(&realm->inodes_with_caps_lock); __insert_snap_realm(&mdsc->snap_realms, realm); mdsc->num_snap_realms++; doutc(mdsc->fsc->client, "%llx %p\n", realm->ino, realm); return realm; } /* * lookup the realm rooted at @ino. * * caller must hold snap_rwsem. */ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino) { struct ceph_client *cl = mdsc->fsc->client; struct rb_node *n = mdsc->snap_realms.rb_node; struct ceph_snap_realm *r; lockdep_assert_held(&mdsc->snap_rwsem); while (n) { r = rb_entry(n, struct ceph_snap_realm, node); if (ino < r->ino) n = n->rb_left; else if (ino > r->ino) n = n->rb_right; else { doutc(cl, "%llx %p\n", r->ino, r); return r; } } return NULL; } struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino) { struct ceph_snap_realm *r; r = __lookup_snap_realm(mdsc, ino); if (r) ceph_get_snap_realm(mdsc, r); return r; } static void __put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm); /* * called with snap_rwsem (write) */ static void __destroy_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { struct ceph_client *cl = mdsc->fsc->client; lockdep_assert_held_write(&mdsc->snap_rwsem); doutc(cl, "%p %llx\n", realm, realm->ino); rb_erase(&realm->node, &mdsc->snap_realms); mdsc->num_snap_realms--; if (realm->parent) { list_del_init(&realm->child_item); __put_snap_realm(mdsc, realm->parent); } kfree(realm->prior_parent_snaps); kfree(realm->snaps); ceph_put_snap_context(realm->cached_context); kfree(realm); } /* * caller holds snap_rwsem (write) */ static void __put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { lockdep_assert_held_write(&mdsc->snap_rwsem); /* * We do not require the snap_empty_lock here, as any caller that * increments the value must hold the snap_rwsem. */ if (atomic_dec_and_test(&realm->nref)) __destroy_snap_realm(mdsc, realm); } /* * See comments in ceph_get_snap_realm. Caller needn't hold any locks. */ void ceph_put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock)) return; if (down_write_trylock(&mdsc->snap_rwsem)) { spin_unlock(&mdsc->snap_empty_lock); __destroy_snap_realm(mdsc, realm); up_write(&mdsc->snap_rwsem); } else { list_add(&realm->empty_item, &mdsc->snap_empty); spin_unlock(&mdsc->snap_empty_lock); } } /* * Clean up any realms whose ref counts have dropped to zero. Note * that this does not include realms who were created but not yet * used. * * Called under snap_rwsem (write) */ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) { struct ceph_snap_realm *realm; lockdep_assert_held_write(&mdsc->snap_rwsem); spin_lock(&mdsc->snap_empty_lock); while (!list_empty(&mdsc->snap_empty)) { realm = list_first_entry(&mdsc->snap_empty, struct ceph_snap_realm, empty_item); list_del(&realm->empty_item); spin_unlock(&mdsc->snap_empty_lock); __destroy_snap_realm(mdsc, realm); spin_lock(&mdsc->snap_empty_lock); } spin_unlock(&mdsc->snap_empty_lock); } void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc) { struct ceph_snap_realm *global_realm; down_write(&mdsc->snap_rwsem); global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM); if (global_realm) ceph_put_snap_realm(mdsc, global_realm); __cleanup_empty_realms(mdsc); up_write(&mdsc->snap_rwsem); } /* * adjust the parent realm of a given @realm. adjust child list, and parent * pointers, and ref counts appropriately. * * return true if parent was changed, 0 if unchanged, <0 on error. * * caller must hold snap_rwsem for write. */ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm, u64 parentino) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_snap_realm *parent; lockdep_assert_held_write(&mdsc->snap_rwsem); if (realm->parent_ino == parentino) return 0; parent = ceph_lookup_snap_realm(mdsc, parentino); if (!parent) { parent = ceph_create_snap_realm(mdsc, parentino); if (IS_ERR(parent)) return PTR_ERR(parent); } doutc(cl, "%llx %p: %llx %p -> %llx %p\n", realm->ino, realm, realm->parent_ino, realm->parent, parentino, parent); if (realm->parent) { list_del_init(&realm->child_item); ceph_put_snap_realm(mdsc, realm->parent); } realm->parent_ino = parentino; realm->parent = parent; list_add(&realm->child_item, &parent->children); return 1; } static int cmpu64_rev(const void *a, const void *b) { if (*(u64 *)a < *(u64 *)b) return 1; if (*(u64 *)a > *(u64 *)b) return -1; return 0; } /* * build the snap context for a given realm. */ static int build_snap_context(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm, struct list_head *realm_queue, struct list_head *dirty_realms) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; int err = 0; u32 num = realm->num_prior_parent_snaps + realm->num_snaps; /* * build parent context, if it hasn't been built. * conservatively estimate that all parent snaps might be * included by us. */ if (parent) { if (!parent->cached_context) { /* add to the queue head */ list_add(&parent->rebuild_item, realm_queue); return 1; } num += parent->cached_context->num_snaps; } /* do i actually need to update? not if my context seq matches realm seq, and my parents' does to. (this works because we rebuild_snap_realms() works _downward_ in hierarchy after each update.) */ if (realm->cached_context && realm->cached_context->seq == realm->seq && (!parent || realm->cached_context->seq >= parent->cached_context->seq)) { doutc(cl, "%llx %p: %p seq %lld (%u snaps) (unchanged)\n", realm->ino, realm, realm->cached_context, realm->cached_context->seq, (unsigned int)realm->cached_context->num_snaps); return 0; } /* alloc new snap context */ err = -ENOMEM; if ((size_t)num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) goto fail; snapc = ceph_create_snap_context(num, GFP_NOFS); if (!snapc) goto fail; /* build (reverse sorted) snap vector */ num = 0; snapc->seq = realm->seq; if (parent) { u32 i; /* include any of parent's snaps occurring _after_ my parent became my parent */ for (i = 0; i < parent->cached_context->num_snaps; i++) if (parent->cached_context->snaps[i] >= realm->parent_since) snapc->snaps[num++] = parent->cached_context->snaps[i]; if (parent->cached_context->seq > snapc->seq) snapc->seq = parent->cached_context->seq; } memcpy(snapc->snaps + num, realm->snaps, sizeof(u64)*realm->num_snaps); num += realm->num_snaps; memcpy(snapc->snaps + num, realm->prior_parent_snaps, sizeof(u64)*realm->num_prior_parent_snaps); num += realm->num_prior_parent_snaps; sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); snapc->num_snaps = num; doutc(cl, "%llx %p: %p seq %lld (%u snaps)\n", realm->ino, realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; /* queue realm for cap_snap creation */ list_add_tail(&realm->dirty_item, dirty_realms); return 0; fail: /* * if we fail, clear old (incorrect) cached_context... hopefully * we'll have better luck building it later */ if (realm->cached_context) { ceph_put_snap_context(realm->cached_context); realm->cached_context = NULL; } pr_err_client(cl, "%llx %p fail %d\n", realm->ino, realm, err); return err; } /* * rebuild snap context for the given realm and all of its children. */ static void rebuild_snap_realms(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm, struct list_head *dirty_realms) { struct ceph_client *cl = mdsc->fsc->client; LIST_HEAD(realm_queue); int last = 0; bool skip = false; list_add_tail(&realm->rebuild_item, &realm_queue); while (!list_empty(&realm_queue)) { struct ceph_snap_realm *_realm, *child; _realm = list_first_entry(&realm_queue, struct ceph_snap_realm, rebuild_item); /* * If the last building failed dues to memory * issue, just empty the realm_queue and return * to avoid infinite loop. */ if (last < 0) { list_del_init(&_realm->rebuild_item); continue; } last = build_snap_context(mdsc, _realm, &realm_queue, dirty_realms); doutc(cl, "%llx %p, %s\n", realm->ino, realm, last > 0 ? "is deferred" : !last ? "succeeded" : "failed"); /* is any child in the list ? */ list_for_each_entry(child, &_realm->children, child_item) { if (!list_empty(&child->rebuild_item)) { skip = true; break; } } if (!skip) { list_for_each_entry(child, &_realm->children, child_item) list_add_tail(&child->rebuild_item, &realm_queue); } /* last == 1 means need to build parent first */ if (last <= 0) list_del_init(&_realm->rebuild_item); } } /* * helper to allocate and decode an array of snapids. free prior * instance, if any. */ static int dup_array(u64 **dst, __le64 *src, u32 num) { u32 i; kfree(*dst); if (num) { *dst = kcalloc(num, sizeof(u64), GFP_NOFS); if (!*dst) return -ENOMEM; for (i = 0; i < num; i++) (*dst)[i] = get_unaligned_le64(src + i); } else { *dst = NULL; } return 0; } static bool has_new_snaps(struct ceph_snap_context *o, struct ceph_snap_context *n) { if (n->num_snaps == 0) return false; /* snaps are in descending order */ return n->snaps[0] > o->seq; } /* * When a snapshot is applied, the size/mtime inode metadata is queued * in a ceph_cap_snap (one for each snapshot) until writeback * completes and the metadata can be flushed back to the MDS. * * However, if a (sync) write is currently in-progress when we apply * the snapshot, we have to wait until the write succeeds or fails * (and a final size/mtime is known). In this case the * cap_snap->writing = 1, and is said to be "pending." When the write * finishes, we __ceph_finish_cap_snap(). * * Caller must hold snap_rwsem for read (i.e., the realm topology won't * change). */ static void ceph_queue_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap **pcapsnap) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_snap_context *old_snapc, *new_snapc; struct ceph_cap_snap *capsnap = *pcapsnap; struct ceph_buffer *old_blob = NULL; int used, dirty; spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); old_snapc = ci->i_head_snapc; new_snapc = ci->i_snap_realm->cached_context; /* * If there is a write in progress, treat that as a dirty Fw, * even though it hasn't completed yet; by the time we finish * up this capsnap it will be. */ if (used & CEPH_CAP_FILE_WR) dirty |= CEPH_CAP_FILE_WR; if (__ceph_have_pending_cap_snap(ci)) { /* there is no point in queuing multiple "pending" cap_snaps, as no new writes are allowed to start when pending, so any writes in progress now were started before the previous cap_snap. lucky us. */ doutc(cl, "%p %llx.%llx already pending\n", inode, ceph_vinop(inode)); goto update_snapc; } if (ci->i_wrbuffer_ref_head == 0 && !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { doutc(cl, "%p %llx.%llx nothing dirty|writing\n", inode, ceph_vinop(inode)); goto update_snapc; } BUG_ON(!old_snapc); /* * There is no need to send FLUSHSNAP message to MDS if there is * no new snapshot. But when there is dirty pages or on-going * writes, we still need to create cap_snap. cap_snap is needed * by the write path and page writeback path. * * also see ceph_try_drop_cap_snap() */ if (has_new_snaps(old_snapc, new_snapc)) { if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR)) capsnap->need_flush = true; } else { if (!(used & CEPH_CAP_FILE_WR) && ci->i_wrbuffer_ref_head == 0) { doutc(cl, "%p %llx.%llx no new_snap|dirty_page|writing\n", inode, ceph_vinop(inode)); goto update_snapc; } } doutc(cl, "%p %llx.%llx cap_snap %p queuing under %p %s %s\n", inode, ceph_vinop(inode), capsnap, old_snapc, ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush"); ihold(inode); capsnap->follows = old_snapc->seq; capsnap->issued = __ceph_caps_issued(ci, NULL); capsnap->dirty = dirty; capsnap->mode = inode->i_mode; capsnap->uid = inode->i_uid; capsnap->gid = inode->i_gid; if (dirty & CEPH_CAP_XATTR_EXCL) { old_blob = __ceph_build_xattrs_blob(ci); capsnap->xattr_blob = ceph_buffer_get(ci->i_xattrs.blob); capsnap->xattr_version = ci->i_xattrs.version; } else { capsnap->xattr_blob = NULL; capsnap->xattr_version = 0; } capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; /* dirty page count moved from _head to this cap_snap; all subsequent writes page dirties occur _after_ this snapshot. */ capsnap->dirty_pages = ci->i_wrbuffer_ref_head; ci->i_wrbuffer_ref_head = 0; capsnap->context = old_snapc; list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); if (used & CEPH_CAP_FILE_WR) { doutc(cl, "%p %llx.%llx cap_snap %p snapc %p seq %llu used WR," " now pending\n", inode, ceph_vinop(inode), capsnap, old_snapc, old_snapc->seq); capsnap->writing = 1; } else { /* note mtime, size NOW. */ __ceph_finish_cap_snap(ci, capsnap); } *pcapsnap = NULL; old_snapc = NULL; update_snapc: if (ci->i_wrbuffer_ref_head == 0 && ci->i_wr_ref == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { ci->i_head_snapc = NULL; } else { ci->i_head_snapc = ceph_get_snap_context(new_snapc); doutc(cl, " new snapc is %p\n", new_snapc); } spin_unlock(&ci->i_ceph_lock); ceph_buffer_put(old_blob); ceph_put_snap_context(old_snapc); } /* * Finalize the size, mtime for a cap_snap.. that is, settle on final values * to be used for the snapshot, to be flushed back to the mds. * * If capsnap can now be flushed, add to snap_flush list, and return 1. * * Caller must hold i_ceph_lock. */ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_client *cl = mdsc->fsc->client; BUG_ON(capsnap->writing); capsnap->size = i_size_read(inode); capsnap->mtime = inode_get_mtime(inode); capsnap->atime = inode_get_atime(inode); capsnap->ctime = inode_get_ctime(inode); capsnap->btime = ci->i_btime; capsnap->change_attr = inode_peek_iversion_raw(inode); capsnap->time_warp_seq = ci->i_time_warp_seq; capsnap->truncate_size = ci->i_truncate_size; capsnap->truncate_seq = ci->i_truncate_seq; if (capsnap->dirty_pages) { doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s " "s=%llu still has %d dirty pages\n", inode, ceph_vinop(inode), capsnap, capsnap->context, capsnap->context->seq, ceph_cap_string(capsnap->dirty), capsnap->size, capsnap->dirty_pages); return 0; } /* * Defer flushing the capsnap if the dirty buffer not flushed yet. * And trigger to flush the buffer immediately. */ if (ci->i_wrbuffer_ref) { doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s " "s=%llu used WRBUFFER, delaying\n", inode, ceph_vinop(inode), capsnap, capsnap->context, capsnap->context->seq, ceph_cap_string(capsnap->dirty), capsnap->size); ceph_queue_writeback(inode); return 0; } ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n", inode, ceph_vinop(inode), capsnap, capsnap->context, capsnap->context->seq, ceph_cap_string(capsnap->dirty), capsnap->size); spin_lock(&mdsc->snap_flush_lock); if (list_empty(&ci->i_snap_flush_item)) { ihold(inode); list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); } spin_unlock(&mdsc->snap_flush_lock); return 1; /* caller may want to ceph_flush_snaps */ } /* * Queue cap_snaps for snap writeback for this realm and its children. * Called under snap_rwsem, so realm topology won't change. */ static void queue_realm_cap_snaps(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct inode *lastinode = NULL; struct ceph_cap_snap *capsnap = NULL; doutc(cl, "%p %llx inode\n", realm, realm->ino); spin_lock(&realm->inodes_with_caps_lock); list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { struct inode *inode = igrab(&ci->netfs.inode); if (!inode) continue; spin_unlock(&realm->inodes_with_caps_lock); iput(lastinode); lastinode = inode; /* * Allocate the capsnap memory outside of ceph_queue_cap_snap() * to reduce very possible but unnecessary frequently memory * allocate/free in this loop. */ if (!capsnap) { capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS); if (!capsnap) { pr_err_client(cl, "ENOMEM allocating ceph_cap_snap on %p\n", inode); return; } } capsnap->cap_flush.is_capsnap = true; refcount_set(&capsnap->nref, 1); INIT_LIST_HEAD(&capsnap->cap_flush.i_list); INIT_LIST_HEAD(&capsnap->cap_flush.g_list); INIT_LIST_HEAD(&capsnap->ci_item); ceph_queue_cap_snap(ci, &capsnap); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); iput(lastinode); if (capsnap) kmem_cache_free(ceph_cap_snap_cachep, capsnap); doutc(cl, "%p %llx done\n", realm, realm->ino); } /* * Parse and apply a snapblob "snap trace" from the MDS. This specifies * the snap realm parameters from a given realm and all of its ancestors, * up to the root. * * Caller must hold snap_rwsem for write. */ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, void *p, void *e, bool deletion, struct ceph_snap_realm **realm_ret) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_snap_realm *ri; /* encoded */ __le64 *snaps; /* encoded */ __le64 *prior_parent_snaps; /* encoded */ struct ceph_snap_realm *realm; struct ceph_snap_realm *first_realm = NULL; struct ceph_snap_realm *realm_to_rebuild = NULL; struct ceph_client *client = mdsc->fsc->client; int rebuild_snapcs; int err = -ENOMEM; int ret; LIST_HEAD(dirty_realms); lockdep_assert_held_write(&mdsc->snap_rwsem); doutc(cl, "deletion=%d\n", deletion); more: realm = NULL; rebuild_snapcs = 0; ceph_decode_need(&p, e, sizeof(*ri), bad); ri = p; p += sizeof(*ri); ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) + le32_to_cpu(ri->num_prior_parent_snaps)), bad); snaps = p; p += sizeof(u64) * le32_to_cpu(ri->num_snaps); prior_parent_snaps = p; p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps); realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino)); if (!realm) { realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino)); if (IS_ERR(realm)) { err = PTR_ERR(realm); goto fail; } } /* ensure the parent is correct */ err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); if (err < 0) goto fail; rebuild_snapcs += err; if (le64_to_cpu(ri->seq) > realm->seq) { doutc(cl, "updating %llx %p %lld -> %lld\n", realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); /* update realm parameters, snap lists */ realm->seq = le64_to_cpu(ri->seq); realm->created = le64_to_cpu(ri->created); realm->parent_since = le64_to_cpu(ri->parent_since); realm->num_snaps = le32_to_cpu(ri->num_snaps); err = dup_array(&realm->snaps, snaps, realm->num_snaps); if (err < 0) goto fail; realm->num_prior_parent_snaps = le32_to_cpu(ri->num_prior_parent_snaps); err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps, realm->num_prior_parent_snaps); if (err < 0) goto fail; if (realm->seq > mdsc->last_snap_seq) mdsc->last_snap_seq = realm->seq; rebuild_snapcs = 1; } else if (!realm->cached_context) { doutc(cl, "%llx %p seq %lld new\n", realm->ino, realm, realm->seq); rebuild_snapcs = 1; } else { doutc(cl, "%llx %p seq %lld unchanged\n", realm->ino, realm, realm->seq); } doutc(cl, "done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino, realm, rebuild_snapcs, p, e); /* * this will always track the uppest parent realm from which * we need to rebuild the snapshot contexts _downward_ in * hierarchy. */ if (rebuild_snapcs) realm_to_rebuild = realm; /* rebuild_snapcs when we reach the _end_ (root) of the trace */ if (realm_to_rebuild && p >= e) rebuild_snap_realms(mdsc, realm_to_rebuild, &dirty_realms); if (!first_realm) first_realm = realm; else ceph_put_snap_realm(mdsc, realm); if (p < e) goto more; /* * queue cap snaps _after_ we've built the new snap contexts, * so that i_head_snapc can be set appropriately. */ while (!list_empty(&dirty_realms)) { realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, dirty_item); list_del_init(&realm->dirty_item); queue_realm_cap_snaps(mdsc, realm); } if (realm_ret) *realm_ret = first_realm; else ceph_put_snap_realm(mdsc, first_realm); __cleanup_empty_realms(mdsc); return 0; bad: err = -EIO; fail: if (realm && !IS_ERR(realm)) ceph_put_snap_realm(mdsc, realm); if (first_realm) ceph_put_snap_realm(mdsc, first_realm); pr_err_client(cl, "error %d\n", err); /* * When receiving a corrupted snap trace we don't know what * exactly has happened in MDS side. And we shouldn't continue * writing to OSD, which may corrupt the snapshot contents. * * Just try to blocklist this kclient and then this kclient * must be remounted to continue after the corrupted metadata * fixed in the MDS side. */ WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO); ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); if (ret) pr_err_client(cl, "failed to blocklist %s: %d\n", ceph_pr_addr(&client->msgr.inst.addr), ret); WARN(1, "[client.%lld] %s %s%sdo remount to continue%s", client->monc.auth->global_id, __func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr), ret ? "" : " was blocklisted, ", err == -EIO ? " after corrupted snaptrace is fixed" : ""); return err; } /* * Send any cap_snaps that are queued for flush. Try to carry * s_mutex across multiple snap flushes to avoid locking overhead. * * Caller holds no locks. */ static void flush_snaps(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct inode *inode; struct ceph_mds_session *session = NULL; doutc(cl, "begin\n"); spin_lock(&mdsc->snap_flush_lock); while (!list_empty(&mdsc->snap_flush_list)) { ci = list_first_entry(&mdsc->snap_flush_list, struct ceph_inode_info, i_snap_flush_item); inode = &ci->netfs.inode; ihold(inode); spin_unlock(&mdsc->snap_flush_lock); ceph_flush_snaps(ci, &session); iput(inode); spin_lock(&mdsc->snap_flush_lock); } spin_unlock(&mdsc->snap_flush_lock); ceph_put_mds_session(session); doutc(cl, "done\n"); } /** * ceph_change_snap_realm - change the snap_realm for an inode * @inode: inode to move to new snap realm * @realm: new realm to move inode into (may be NULL) * * Detach an inode from its old snaprealm (if any) and attach it to * the new snaprealm (if any). The old snap realm reference held by * the inode is put. If realm is non-NULL, then the caller's reference * to it is taken over by the inode. */ void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_snap_realm *oldrealm = ci->i_snap_realm; lockdep_assert_held(&ci->i_ceph_lock); if (oldrealm) { spin_lock(&oldrealm->inodes_with_caps_lock); list_del_init(&ci->i_snap_realm_item); if (oldrealm->ino == ci->i_vino.ino) oldrealm->inode = NULL; spin_unlock(&oldrealm->inodes_with_caps_lock); ceph_put_snap_realm(mdsc, oldrealm); } ci->i_snap_realm = realm; if (realm) { spin_lock(&realm->inodes_with_caps_lock); list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps); if (realm->ino == ci->i_vino.ino) realm->inode = inode; spin_unlock(&realm->inodes_with_caps_lock); } } /* * Handle a snap notification from the MDS. * * This can take two basic forms: the simplest is just a snap creation * or deletion notification on an existing realm. This should update the * realm and its children. * * The more difficult case is realm creation, due to snap creation at a * new point in the file hierarchy, or due to a rename that moves a file or * directory into another realm. */ void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_client *cl = mdsc->fsc->client; struct super_block *sb = mdsc->fsc->sb; int mds = session->s_mds; u64 split; int op; int trace_len; struct ceph_snap_realm *realm = NULL; void *p = msg->front.iov_base; void *e = p + msg->front.iov_len; struct ceph_mds_snap_head *h; int num_split_inos, num_split_realms; __le64 *split_inos = NULL, *split_realms = NULL; int i; int locked_rwsem = 0; bool close_sessions = false; if (!ceph_inc_mds_stopping_blocker(mdsc, session)) return; /* decode */ if (msg->front.iov_len < sizeof(*h)) goto bad; h = p; op = le32_to_cpu(h->op); split = le64_to_cpu(h->split); /* non-zero if we are splitting an * existing realm */ num_split_inos = le32_to_cpu(h->num_split_inos); num_split_realms = le32_to_cpu(h->num_split_realms); trace_len = le32_to_cpu(h->trace_len); p += sizeof(*h); doutc(cl, "from mds%d op %s split %llx tracelen %d\n", mds, ceph_snap_op_name(op), split, trace_len); down_write(&mdsc->snap_rwsem); locked_rwsem = 1; if (op == CEPH_SNAP_OP_SPLIT) { struct ceph_mds_snap_realm *ri; /* * A "split" breaks part of an existing realm off into * a new realm. The MDS provides a list of inodes * (with caps) and child realms that belong to the new * child. */ split_inos = p; p += sizeof(u64) * num_split_inos; split_realms = p; p += sizeof(u64) * num_split_realms; ceph_decode_need(&p, e, sizeof(*ri), bad); /* we will peek at realm info here, but will _not_ * advance p, as the realm update will occur below in * ceph_update_snap_trace. */ ri = p; realm = ceph_lookup_snap_realm(mdsc, split); if (!realm) { realm = ceph_create_snap_realm(mdsc, split); if (IS_ERR(realm)) goto out; } doutc(cl, "splitting snap_realm %llx %p\n", realm->ino, realm); for (i = 0; i < num_split_inos; i++) { struct ceph_vino vino = { .ino = le64_to_cpu(split_inos[i]), .snap = CEPH_NOSNAP, }; struct inode *inode = ceph_find_inode(sb, vino); struct ceph_inode_info *ci; if (!inode) continue; ci = ceph_inode(inode); spin_lock(&ci->i_ceph_lock); if (!ci->i_snap_realm) goto skip_inode; /* * If this inode belongs to a realm that was * created after our new realm, we experienced * a race (due to another split notifications * arriving from a different MDS). So skip * this inode. */ if (ci->i_snap_realm->created > le64_to_cpu(ri->created)) { doutc(cl, " leaving %p %llx.%llx in newer realm %llx %p\n", inode, ceph_vinop(inode), ci->i_snap_realm->ino, ci->i_snap_realm); goto skip_inode; } doutc(cl, " will move %p %llx.%llx to split realm %llx %p\n", inode, ceph_vinop(inode), realm->ino, realm); ceph_get_snap_realm(mdsc, realm); ceph_change_snap_realm(inode, realm); spin_unlock(&ci->i_ceph_lock); iput(inode); continue; skip_inode: spin_unlock(&ci->i_ceph_lock); iput(inode); } /* we may have taken some of the old realm's children. */ for (i = 0; i < num_split_realms; i++) { struct ceph_snap_realm *child = __lookup_snap_realm(mdsc, le64_to_cpu(split_realms[i])); if (!child) continue; adjust_snap_realm_parent(mdsc, child, realm->ino); } } else { /* * In the non-split case both 'num_split_inos' and * 'num_split_realms' should be 0, making this a no-op. * However the MDS happens to populate 'split_realms' list * in one of the UPDATE op cases by mistake. * * Skip both lists just in case to ensure that 'p' is * positioned at the start of realm info, as expected by * ceph_update_snap_trace(). */ p += sizeof(u64) * num_split_inos; p += sizeof(u64) * num_split_realms; } /* * update using the provided snap trace. if we are deleting a * snap, we can avoid queueing cap_snaps. */ if (ceph_update_snap_trace(mdsc, p, e, op == CEPH_SNAP_OP_DESTROY, NULL)) { close_sessions = true; goto bad; } if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */ ceph_put_snap_realm(mdsc, realm); __cleanup_empty_realms(mdsc); up_write(&mdsc->snap_rwsem); flush_snaps(mdsc); ceph_dec_mds_stopping_blocker(mdsc); return; bad: pr_err_client(cl, "corrupt snap message from mds%d\n", mds); ceph_msg_dump(msg); out: if (locked_rwsem) up_write(&mdsc->snap_rwsem); ceph_dec_mds_stopping_blocker(mdsc); if (close_sessions) ceph_mdsc_close_sessions(mdsc); return; } struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, u64 snap) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_snapid_map *sm, *exist; struct rb_node **p, *parent; int ret; exist = NULL; spin_lock(&mdsc->snapid_map_lock); p = &mdsc->snapid_map_tree.rb_node; while (*p) { exist = rb_entry(*p, struct ceph_snapid_map, node); if (snap > exist->snap) { p = &(*p)->rb_left; } else if (snap < exist->snap) { p = &(*p)->rb_right; } else { if (atomic_inc_return(&exist->ref) == 1) list_del_init(&exist->lru); break; } exist = NULL; } spin_unlock(&mdsc->snapid_map_lock); if (exist) { doutc(cl, "found snapid map %llx -> %x\n", exist->snap, exist->dev); return exist; } sm = kmalloc(sizeof(*sm), GFP_NOFS); if (!sm) return NULL; ret = get_anon_bdev(&sm->dev); if (ret < 0) { kfree(sm); return NULL; } INIT_LIST_HEAD(&sm->lru); atomic_set(&sm->ref, 1); sm->snap = snap; exist = NULL; parent = NULL; p = &mdsc->snapid_map_tree.rb_node; spin_lock(&mdsc->snapid_map_lock); while (*p) { parent = *p; exist = rb_entry(*p, struct ceph_snapid_map, node); if (snap > exist->snap) p = &(*p)->rb_left; else if (snap < exist->snap) p = &(*p)->rb_right; else break; exist = NULL; } if (exist) { if (atomic_inc_return(&exist->ref) == 1) list_del_init(&exist->lru); } else { rb_link_node(&sm->node, parent, p); rb_insert_color(&sm->node, &mdsc->snapid_map_tree); } spin_unlock(&mdsc->snapid_map_lock); if (exist) { free_anon_bdev(sm->dev); kfree(sm); doutc(cl, "found snapid map %llx -> %x\n", exist->snap, exist->dev); return exist; } doutc(cl, "create snapid map %llx -> %x\n", sm->snap, sm->dev); return sm; } void ceph_put_snapid_map(struct ceph_mds_client* mdsc, struct ceph_snapid_map *sm) { if (!sm) return; if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) { if (!RB_EMPTY_NODE(&sm->node)) { sm->last_used = jiffies; list_add_tail(&sm->lru, &mdsc->snapid_map_lru); spin_unlock(&mdsc->snapid_map_lock); } else { /* already cleaned up by * ceph_cleanup_snapid_map() */ spin_unlock(&mdsc->snapid_map_lock); kfree(sm); } } } void ceph_trim_snapid_map(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_snapid_map *sm; unsigned long now; LIST_HEAD(to_free); spin_lock(&mdsc->snapid_map_lock); now = jiffies; while (!list_empty(&mdsc->snapid_map_lru)) { sm = list_first_entry(&mdsc->snapid_map_lru, struct ceph_snapid_map, lru); if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now)) break; rb_erase(&sm->node, &mdsc->snapid_map_tree); list_move(&sm->lru, &to_free); } spin_unlock(&mdsc->snapid_map_lock); while (!list_empty(&to_free)) { sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); list_del(&sm->lru); doutc(cl, "trim snapid map %llx -> %x\n", sm->snap, sm->dev); free_anon_bdev(sm->dev); kfree(sm); } } void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_snapid_map *sm; struct rb_node *p; LIST_HEAD(to_free); spin_lock(&mdsc->snapid_map_lock); while ((p = rb_first(&mdsc->snapid_map_tree))) { sm = rb_entry(p, struct ceph_snapid_map, node); rb_erase(p, &mdsc->snapid_map_tree); RB_CLEAR_NODE(p); list_move(&sm->lru, &to_free); } spin_unlock(&mdsc->snapid_map_lock); while (!list_empty(&to_free)) { sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); list_del(&sm->lru); free_anon_bdev(sm->dev); if (WARN_ON_ONCE(atomic_read(&sm->ref))) { pr_err_client(cl, "snapid map %llx -> %x still in use\n", sm->snap, sm->dev); } kfree(sm); } }
55 72 4 54 8 54 14 5 5 3 2 4 2 5 4 6 6 5 4 3 2 3 1 72 72 72 72 72 1 72 72 72 72 72 4 72 80 80 80 6 74 80 3 175 173 1 1 14 14 10 3 16 15 1 8 7 4 7 4 203 10 2 8 10 10 16 16 16 16 16 5 76 76 1 3 69 2 6 9 18 18 8 8 7 5 10 2 9 8 3 29 29 29 1 21 24 1 2 2 4 2 2 7 2 1 1 1 1 1 6 1 1 1 1 7 3 1 1 4 1 1 12 14 14 14 5 3 1 2 3 1 1 3 1 4 3 3 4 2 5 8 1 1 1 1 2 1 1 3 2 1 2 1 1 3 1 16 1 5 1 7 3 6 11 14 1 3 10 1 3 1 1 1 2 3 7 1 2 1 7 1 1 1 1 2 1 1 135 1 2 133 135 203 65 203 147 231 232 232 232 203 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 // SPDX-License-Identifier: GPL-2.0-only /* * Event char devices, giving access to raw input device events. * * Copyright (c) 1999-2002 Vojtech Pavlik */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define EVDEV_MINOR_BASE 64 #define EVDEV_MINORS 32 #define EVDEV_MIN_BUFFER_SIZE 64U #define EVDEV_BUF_PACKETS 8 #include <linux/poll.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/init.h> #include <linux/input/mt.h> #include <linux/major.h> #include <linux/device.h> #include <linux/cdev.h> #include "input-compat.h" struct evdev { int open; struct input_handle handle; struct evdev_client __rcu *grab; struct list_head client_list; spinlock_t client_lock; /* protects client_list */ struct mutex mutex; struct device dev; struct cdev cdev; bool exist; }; struct evdev_client { unsigned int head; unsigned int tail; unsigned int packet_head; /* [future] position of the first element of next packet */ spinlock_t buffer_lock; /* protects access to buffer, head and tail */ wait_queue_head_t wait; struct fasync_struct *fasync; struct evdev *evdev; struct list_head node; enum input_clock_type clk_type; bool revoked; unsigned long *evmasks[EV_CNT]; unsigned int bufsize; struct input_event buffer[] __counted_by(bufsize); }; static size_t evdev_get_mask_cnt(unsigned int type) { static const size_t counts[EV_CNT] = { /* EV_SYN==0 is EV_CNT, _not_ SYN_CNT, see EVIOCGBIT */ [EV_SYN] = EV_CNT, [EV_KEY] = KEY_CNT, [EV_REL] = REL_CNT, [EV_ABS] = ABS_CNT, [EV_MSC] = MSC_CNT, [EV_SW] = SW_CNT, [EV_LED] = LED_CNT, [EV_SND] = SND_CNT, [EV_FF] = FF_CNT, }; return (type < EV_CNT) ? counts[type] : 0; } /* requires the buffer lock to be held */ static bool __evdev_is_filtered(struct evdev_client *client, unsigned int type, unsigned int code) { unsigned long *mask; size_t cnt; /* EV_SYN and unknown codes are never filtered */ if (type == EV_SYN || type >= EV_CNT) return false; /* first test whether the type is filtered */ mask = client->evmasks[0]; if (mask && !test_bit(type, mask)) return true; /* unknown values are never filtered */ cnt = evdev_get_mask_cnt(type); if (!cnt || code >= cnt) return false; mask = client->evmasks[type]; return mask && !test_bit(code, mask); } /* flush queued events of type @type, caller must hold client->buffer_lock */ static void __evdev_flush_queue(struct evdev_client *client, unsigned int type) { unsigned int i, head, num; unsigned int mask = client->bufsize - 1; bool is_report; struct input_event *ev; BUG_ON(type == EV_SYN); head = client->tail; client->packet_head = client->tail; /* init to 1 so a leading SYN_REPORT will not be dropped */ num = 1; for (i = client->tail; i != client->head; i = (i + 1) & mask) { ev = &client->buffer[i]; is_report = ev->type == EV_SYN && ev->code == SYN_REPORT; if (ev->type == type) { /* drop matched entry */ continue; } else if (is_report && !num) { /* drop empty SYN_REPORT groups */ continue; } else if (head != i) { /* move entry to fill the gap */ client->buffer[head] = *ev; } num++; head = (head + 1) & mask; if (is_report) { num = 0; client->packet_head = head; } } client->head = head; } static void __evdev_queue_syn_dropped(struct evdev_client *client) { ktime_t *ev_time = input_get_timestamp(client->evdev->handle.dev); struct timespec64 ts = ktime_to_timespec64(ev_time[client->clk_type]); struct input_event ev; ev.input_event_sec = ts.tv_sec; ev.input_event_usec = ts.tv_nsec / NSEC_PER_USEC; ev.type = EV_SYN; ev.code = SYN_DROPPED; ev.value = 0; client->buffer[client->head++] = ev; client->head &= client->bufsize - 1; if (unlikely(client->head == client->tail)) { /* drop queue but keep our SYN_DROPPED event */ client->tail = (client->head - 1) & (client->bufsize - 1); client->packet_head = client->tail; } } static void evdev_queue_syn_dropped(struct evdev_client *client) { unsigned long flags; spin_lock_irqsave(&client->buffer_lock, flags); __evdev_queue_syn_dropped(client); spin_unlock_irqrestore(&client->buffer_lock, flags); } static int evdev_set_clk_type(struct evdev_client *client, unsigned int clkid) { unsigned long flags; enum input_clock_type clk_type; switch (clkid) { case CLOCK_REALTIME: clk_type = INPUT_CLK_REAL; break; case CLOCK_MONOTONIC: clk_type = INPUT_CLK_MONO; break; case CLOCK_BOOTTIME: clk_type = INPUT_CLK_BOOT; break; default: return -EINVAL; } if (client->clk_type != clk_type) { client->clk_type = clk_type; /* * Flush pending events and queue SYN_DROPPED event, * but only if the queue is not empty. */ spin_lock_irqsave(&client->buffer_lock, flags); if (client->head != client->tail) { client->packet_head = client->head = client->tail; __evdev_queue_syn_dropped(client); } spin_unlock_irqrestore(&client->buffer_lock, flags); } return 0; } static void __pass_event(struct evdev_client *client, const struct input_event *event) { client->buffer[client->head++] = *event; client->head &= client->bufsize - 1; if (unlikely(client->head == client->tail)) { /* * This effectively "drops" all unconsumed events, leaving * EV_SYN/SYN_DROPPED plus the newest event in the queue. */ client->tail = (client->head - 2) & (client->bufsize - 1); client->buffer[client->tail] = (struct input_event) { .input_event_sec = event->input_event_sec, .input_event_usec = event->input_event_usec, .type = EV_SYN, .code = SYN_DROPPED, .value = 0, }; client->packet_head = client->tail; } if (event->type == EV_SYN && event->code == SYN_REPORT) { client->packet_head = client->head; kill_fasync(&client->fasync, SIGIO, POLL_IN); } } static void evdev_pass_values(struct evdev_client *client, const struct input_value *vals, unsigned int count, ktime_t *ev_time) { const struct input_value *v; struct input_event event; struct timespec64 ts; bool wakeup = false; if (client->revoked) return; ts = ktime_to_timespec64(ev_time[client->clk_type]); event.input_event_sec = ts.tv_sec; event.input_event_usec = ts.tv_nsec / NSEC_PER_USEC; /* Interrupts are disabled, just acquire the lock. */ spin_lock(&client->buffer_lock); for (v = vals; v != vals + count; v++) { if (__evdev_is_filtered(client, v->type, v->code)) continue; if (v->type == EV_SYN && v->code == SYN_REPORT) { /* drop empty SYN_REPORT */ if (client->packet_head == client->head) continue; wakeup = true; } event.type = v->type; event.code = v->code; event.value = v->value; __pass_event(client, &event); } spin_unlock(&client->buffer_lock); if (wakeup) wake_up_interruptible_poll(&client->wait, EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM); } /* * Pass incoming events to all connected clients. */ static unsigned int evdev_events(struct input_handle *handle, struct input_value *vals, unsigned int count) { struct evdev *evdev = handle->private; struct evdev_client *client; ktime_t *ev_time = input_get_timestamp(handle->dev); rcu_read_lock(); client = rcu_dereference(evdev->grab); if (client) evdev_pass_values(client, vals, count, ev_time); else list_for_each_entry_rcu(client, &evdev->client_list, node) evdev_pass_values(client, vals, count, ev_time); rcu_read_unlock(); return count; } static int evdev_fasync(int fd, struct file *file, int on) { struct evdev_client *client = file->private_data; return fasync_helper(fd, file, on, &client->fasync); } static void evdev_free(struct device *dev) { struct evdev *evdev = container_of(dev, struct evdev, dev); input_put_device(evdev->handle.dev); kfree(evdev); } /* * Grabs an event device (along with underlying input device). * This function is called with evdev->mutex taken. */ static int evdev_grab(struct evdev *evdev, struct evdev_client *client) { int error; if (evdev->grab) return -EBUSY; error = input_grab_device(&evdev->handle); if (error) return error; rcu_assign_pointer(evdev->grab, client); return 0; } static int evdev_ungrab(struct evdev *evdev, struct evdev_client *client) { struct evdev_client *grab = rcu_dereference_protected(evdev->grab, lockdep_is_held(&evdev->mutex)); if (grab != client) return -EINVAL; rcu_assign_pointer(evdev->grab, NULL); synchronize_rcu(); input_release_device(&evdev->handle); return 0; } static void evdev_attach_client(struct evdev *evdev, struct evdev_client *client) { spin_lock(&evdev->client_lock); list_add_tail_rcu(&client->node, &evdev->client_list); spin_unlock(&evdev->client_lock); } static void evdev_detach_client(struct evdev *evdev, struct evdev_client *client) { spin_lock(&evdev->client_lock); list_del_rcu(&client->node); spin_unlock(&evdev->client_lock); synchronize_rcu(); } static int evdev_open_device(struct evdev *evdev) { int retval; retval = mutex_lock_interruptible(&evdev->mutex); if (retval) return retval; if (!evdev->exist) retval = -ENODEV; else if (!evdev->open++) { retval = input_open_device(&evdev->handle); if (retval) evdev->open--; } mutex_unlock(&evdev->mutex); return retval; } static void evdev_close_device(struct evdev *evdev) { mutex_lock(&evdev->mutex); if (evdev->exist && !--evdev->open) input_close_device(&evdev->handle); mutex_unlock(&evdev->mutex); } /* * Wake up users waiting for IO so they can disconnect from * dead device. */ static void evdev_hangup(struct evdev *evdev) { struct evdev_client *client; spin_lock(&evdev->client_lock); list_for_each_entry(client, &evdev->client_list, node) { kill_fasync(&client->fasync, SIGIO, POLL_HUP); wake_up_interruptible_poll(&client->wait, EPOLLHUP | EPOLLERR); } spin_unlock(&evdev->client_lock); } static int evdev_release(struct inode *inode, struct file *file) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; unsigned int i; mutex_lock(&evdev->mutex); if (evdev->exist && !client->revoked) input_flush_device(&evdev->handle, file); evdev_ungrab(evdev, client); mutex_unlock(&evdev->mutex); evdev_detach_client(evdev, client); for (i = 0; i < EV_CNT; ++i) bitmap_free(client->evmasks[i]); kvfree(client); evdev_close_device(evdev); return 0; } static unsigned int evdev_compute_buffer_size(struct input_dev *dev) { unsigned int n_events = max(dev->hint_events_per_packet * EVDEV_BUF_PACKETS, EVDEV_MIN_BUFFER_SIZE); return roundup_pow_of_two(n_events); } static int evdev_open(struct inode *inode, struct file *file) { struct evdev *evdev = container_of(inode->i_cdev, struct evdev, cdev); unsigned int bufsize = evdev_compute_buffer_size(evdev->handle.dev); struct evdev_client *client; int error; client = kvzalloc(struct_size(client, buffer, bufsize), GFP_KERNEL); if (!client) return -ENOMEM; init_waitqueue_head(&client->wait); client->bufsize = bufsize; spin_lock_init(&client->buffer_lock); client->evdev = evdev; evdev_attach_client(evdev, client); error = evdev_open_device(evdev); if (error) goto err_free_client; file->private_data = client; stream_open(inode, file); return 0; err_free_client: evdev_detach_client(evdev, client); kvfree(client); return error; } static ssize_t evdev_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; struct input_event event; int retval = 0; /* * Limit amount of data we inject into the input subsystem so that * we do not hold evdev->mutex for too long. 4096 bytes corresponds * to 170 input events. */ count = min(count, 4096); if (count != 0 && count < input_event_size()) return -EINVAL; retval = mutex_lock_interruptible(&evdev->mutex); if (retval) return retval; if (!evdev->exist || client->revoked) { retval = -ENODEV; goto out; } while (retval + input_event_size() <= count) { if (input_event_from_user(buffer + retval, &event)) { retval = -EFAULT; goto out; } retval += input_event_size(); input_inject_event(&evdev->handle, event.type, event.code, event.value); cond_resched(); } out: mutex_unlock(&evdev->mutex); return retval; } static int evdev_fetch_next_event(struct evdev_client *client, struct input_event *event) { int have_event; spin_lock_irq(&client->buffer_lock); have_event = client->packet_head != client->tail; if (have_event) { *event = client->buffer[client->tail++]; client->tail &= client->bufsize - 1; } spin_unlock_irq(&client->buffer_lock); return have_event; } static ssize_t evdev_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; struct input_event event; size_t read = 0; int error; if (count != 0 && count < input_event_size()) return -EINVAL; for (;;) { if (!evdev->exist || client->revoked) return -ENODEV; if (client->packet_head == client->tail && (file->f_flags & O_NONBLOCK)) return -EAGAIN; /* * count == 0 is special - no IO is done but we check * for error conditions (see above). */ if (count == 0) break; while (read + input_event_size() <= count && evdev_fetch_next_event(client, &event)) { if (input_event_to_user(buffer + read, &event)) return -EFAULT; read += input_event_size(); } if (read) break; if (!(file->f_flags & O_NONBLOCK)) { error = wait_event_interruptible(client->wait, client->packet_head != client->tail || !evdev->exist || client->revoked); if (error) return error; } } return read; } /* No kernel lock - fine */ static __poll_t evdev_poll(struct file *file, poll_table *wait) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; __poll_t mask; poll_wait(file, &client->wait, wait); if (evdev->exist && !client->revoked) mask = EPOLLOUT | EPOLLWRNORM; else mask = EPOLLHUP | EPOLLERR; if (client->packet_head != client->tail) mask |= EPOLLIN | EPOLLRDNORM; return mask; } #ifdef CONFIG_COMPAT #define BITS_PER_LONG_COMPAT (sizeof(compat_long_t) * 8) #define BITS_TO_LONGS_COMPAT(x) ((((x) - 1) / BITS_PER_LONG_COMPAT) + 1) #ifdef __BIG_ENDIAN static int bits_to_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int len, i; if (compat) { len = BITS_TO_LONGS_COMPAT(maxbit) * sizeof(compat_long_t); if (len > maxlen) len = maxlen; for (i = 0; i < len / sizeof(compat_long_t); i++) if (copy_to_user((compat_long_t __user *) p + i, (compat_long_t *) bits + i + 1 - ((i % 2) << 1), sizeof(compat_long_t))) return -EFAULT; } else { len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; if (copy_to_user(p, bits, len)) return -EFAULT; } return len; } static int bits_from_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, const void __user *p, int compat) { int len, i; if (compat) { if (maxlen % sizeof(compat_long_t)) return -EINVAL; len = BITS_TO_LONGS_COMPAT(maxbit) * sizeof(compat_long_t); if (len > maxlen) len = maxlen; for (i = 0; i < len / sizeof(compat_long_t); i++) if (copy_from_user((compat_long_t *) bits + i + 1 - ((i % 2) << 1), (compat_long_t __user *) p + i, sizeof(compat_long_t))) return -EFAULT; if (i % 2) *((compat_long_t *) bits + i - 1) = 0; } else { if (maxlen % sizeof(long)) return -EINVAL; len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; if (copy_from_user(bits, p, len)) return -EFAULT; } return len; } #else static int bits_to_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int len = compat ? BITS_TO_LONGS_COMPAT(maxbit) * sizeof(compat_long_t) : BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; return copy_to_user(p, bits, len) ? -EFAULT : len; } static int bits_from_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, const void __user *p, int compat) { size_t chunk_size = compat ? sizeof(compat_long_t) : sizeof(long); int len; if (maxlen % chunk_size) return -EINVAL; len = compat ? BITS_TO_LONGS_COMPAT(maxbit) : BITS_TO_LONGS(maxbit); len *= chunk_size; if (len > maxlen) len = maxlen; return copy_from_user(bits, p, len) ? -EFAULT : len; } #endif /* __BIG_ENDIAN */ #else static int bits_to_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; return copy_to_user(p, bits, len) ? -EFAULT : len; } static int bits_from_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, const void __user *p, int compat) { int len; if (maxlen % sizeof(long)) return -EINVAL; len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; return copy_from_user(bits, p, len) ? -EFAULT : len; } #endif /* CONFIG_COMPAT */ static int str_to_user(const char *str, unsigned int maxlen, void __user *p) { int len; if (!str) return -ENOENT; len = strlen(str) + 1; if (len > maxlen) len = maxlen; return copy_to_user(p, str, len) ? -EFAULT : len; } static int handle_eviocgbit(struct input_dev *dev, unsigned int type, unsigned int size, void __user *p, int compat_mode) { unsigned long *bits; int len; switch (type) { case 0: bits = dev->evbit; len = EV_MAX; break; case EV_KEY: bits = dev->keybit; len = KEY_MAX; break; case EV_REL: bits = dev->relbit; len = REL_MAX; break; case EV_ABS: bits = dev->absbit; len = ABS_MAX; break; case EV_MSC: bits = dev->mscbit; len = MSC_MAX; break; case EV_LED: bits = dev->ledbit; len = LED_MAX; break; case EV_SND: bits = dev->sndbit; len = SND_MAX; break; case EV_FF: bits = dev->ffbit; len = FF_MAX; break; case EV_SW: bits = dev->swbit; len = SW_MAX; break; default: return -EINVAL; } return bits_to_user(bits, len, size, p, compat_mode); } static int evdev_handle_get_keycode(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke = { .len = sizeof(unsigned int), .flags = 0, }; int __user *ip = (int __user *)p; int error; /* legacy case */ if (copy_from_user(ke.scancode, p, sizeof(unsigned int))) return -EFAULT; error = input_get_keycode(dev, &ke); if (error) return error; if (put_user(ke.keycode, ip + 1)) return -EFAULT; return 0; } static int evdev_handle_get_keycode_v2(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke; int error; if (copy_from_user(&ke, p, sizeof(ke))) return -EFAULT; error = input_get_keycode(dev, &ke); if (error) return error; if (copy_to_user(p, &ke, sizeof(ke))) return -EFAULT; return 0; } static int evdev_handle_set_keycode(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke = { .len = sizeof(unsigned int), .flags = 0, }; int __user *ip = (int __user *)p; if (copy_from_user(ke.scancode, p, sizeof(unsigned int))) return -EFAULT; if (get_user(ke.keycode, ip + 1)) return -EFAULT; return input_set_keycode(dev, &ke); } static int evdev_handle_set_keycode_v2(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke; if (copy_from_user(&ke, p, sizeof(ke))) return -EFAULT; if (ke.len > sizeof(ke.scancode)) return -EINVAL; return input_set_keycode(dev, &ke); } /* * If we transfer state to the user, we should flush all pending events * of the same type from the client's queue. Otherwise, they might end up * with duplicate events, which can screw up client's state tracking. * If bits_to_user fails after flushing the queue, we queue a SYN_DROPPED * event so user-space will notice missing events. * * LOCKING: * We need to take event_lock before buffer_lock to avoid dead-locks. But we * need the even_lock only to guarantee consistent state. We can safely release * it while flushing the queue. This allows input-core to handle filters while * we flush the queue. */ static int evdev_handle_get_val(struct evdev_client *client, struct input_dev *dev, unsigned int type, unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int ret; unsigned long *mem; mem = bitmap_alloc(maxbit, GFP_KERNEL); if (!mem) return -ENOMEM; spin_lock_irq(&dev->event_lock); spin_lock(&client->buffer_lock); bitmap_copy(mem, bits, maxbit); spin_unlock(&dev->event_lock); __evdev_flush_queue(client, type); spin_unlock_irq(&client->buffer_lock); ret = bits_to_user(mem, maxbit, maxlen, p, compat); if (ret < 0) evdev_queue_syn_dropped(client); bitmap_free(mem); return ret; } static int evdev_handle_mt_request(struct input_dev *dev, unsigned int size, int __user *ip) { const struct input_mt *mt = dev->mt; unsigned int code; int max_slots; int i; if (get_user(code, &ip[0])) return -EFAULT; if (!mt || !input_is_mt_value(code)) return -EINVAL; max_slots = (size - sizeof(__u32)) / sizeof(__s32); for (i = 0; i < mt->num_slots && i < max_slots; i++) { int value = input_mt_get_value(&mt->slots[i], code); if (put_user(value, &ip[1 + i])) return -EFAULT; } return 0; } static int evdev_revoke(struct evdev *evdev, struct evdev_client *client, struct file *file) { client->revoked = true; evdev_ungrab(evdev, client); input_flush_device(&evdev->handle, file); wake_up_interruptible_poll(&client->wait, EPOLLHUP | EPOLLERR); return 0; } /* must be called with evdev-mutex held */ static int evdev_set_mask(struct evdev_client *client, unsigned int type, const void __user *codes, u32 codes_size, int compat) { unsigned long flags, *mask, *oldmask; size_t cnt; int error; /* we allow unknown types and 'codes_size > size' for forward-compat */ cnt = evdev_get_mask_cnt(type); if (!cnt) return 0; mask = bitmap_zalloc(cnt, GFP_KERNEL); if (!mask) return -ENOMEM; error = bits_from_user(mask, cnt - 1, codes_size, codes, compat); if (error < 0) { bitmap_free(mask); return error; } spin_lock_irqsave(&client->buffer_lock, flags); oldmask = client->evmasks[type]; client->evmasks[type] = mask; spin_unlock_irqrestore(&client->buffer_lock, flags); bitmap_free(oldmask); return 0; } /* must be called with evdev-mutex held */ static int evdev_get_mask(struct evdev_client *client, unsigned int type, void __user *codes, u32 codes_size, int compat) { unsigned long *mask; size_t cnt, size, xfer_size; int i; int error; /* we allow unknown types and 'codes_size > size' for forward-compat */ cnt = evdev_get_mask_cnt(type); size = sizeof(unsigned long) * BITS_TO_LONGS(cnt); xfer_size = min_t(size_t, codes_size, size); if (cnt > 0) { mask = client->evmasks[type]; if (mask) { error = bits_to_user(mask, cnt - 1, xfer_size, codes, compat); if (error < 0) return error; } else { /* fake mask with all bits set */ for (i = 0; i < xfer_size; i++) if (put_user(0xffU, (u8 __user *)codes + i)) return -EFAULT; } } if (xfer_size < codes_size) if (clear_user(codes + xfer_size, codes_size - xfer_size)) return -EFAULT; return 0; } static long evdev_do_ioctl(struct file *file, unsigned int cmd, void __user *p, int compat_mode) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; struct input_dev *dev = evdev->handle.dev; struct input_absinfo abs; struct input_mask mask; struct ff_effect effect; int __user *ip = (int __user *)p; unsigned int i, t, u, v; unsigned int size; int error; /* First we check for fixed-length commands */ switch (cmd) { case EVIOCGVERSION: return put_user(EV_VERSION, ip); case EVIOCGID: if (copy_to_user(p, &dev->id, sizeof(struct input_id))) return -EFAULT; return 0; case EVIOCGREP: if (!test_bit(EV_REP, dev->evbit)) return -ENOSYS; if (put_user(dev->rep[REP_DELAY], ip)) return -EFAULT; if (put_user(dev->rep[REP_PERIOD], ip + 1)) return -EFAULT; return 0; case EVIOCSREP: if (!test_bit(EV_REP, dev->evbit)) return -ENOSYS; if (get_user(u, ip)) return -EFAULT; if (get_user(v, ip + 1)) return -EFAULT; input_inject_event(&evdev->handle, EV_REP, REP_DELAY, u); input_inject_event(&evdev->handle, EV_REP, REP_PERIOD, v); return 0; case EVIOCRMFF: return input_ff_erase(dev, (int)(unsigned long) p, file); case EVIOCGEFFECTS: i = test_bit(EV_FF, dev->evbit) ? dev->ff->max_effects : 0; if (put_user(i, ip)) return -EFAULT; return 0; case EVIOCGRAB: if (p) return evdev_grab(evdev, client); else return evdev_ungrab(evdev, client); case EVIOCREVOKE: if (p) return -EINVAL; else return evdev_revoke(evdev, client, file); case EVIOCGMASK: { void __user *codes_ptr; if (copy_from_user(&mask, p, sizeof(mask))) return -EFAULT; codes_ptr = (void __user *)(unsigned long)mask.codes_ptr; return evdev_get_mask(client, mask.type, codes_ptr, mask.codes_size, compat_mode); } case EVIOCSMASK: { const void __user *codes_ptr; if (copy_from_user(&mask, p, sizeof(mask))) return -EFAULT; codes_ptr = (const void __user *)(unsigned long)mask.codes_ptr; return evdev_set_mask(client, mask.type, codes_ptr, mask.codes_size, compat_mode); } case EVIOCSCLOCKID: if (copy_from_user(&i, p, sizeof(unsigned int))) return -EFAULT; return evdev_set_clk_type(client, i); case EVIOCGKEYCODE: return evdev_handle_get_keycode(dev, p); case EVIOCSKEYCODE: return evdev_handle_set_keycode(dev, p); case EVIOCGKEYCODE_V2: return evdev_handle_get_keycode_v2(dev, p); case EVIOCSKEYCODE_V2: return evdev_handle_set_keycode_v2(dev, p); } size = _IOC_SIZE(cmd); /* Now check variable-length commands */ #define EVIOC_MASK_SIZE(nr) ((nr) & ~(_IOC_SIZEMASK << _IOC_SIZESHIFT)) switch (EVIOC_MASK_SIZE(cmd)) { case EVIOCGPROP(0): return bits_to_user(dev->propbit, INPUT_PROP_MAX, size, p, compat_mode); case EVIOCGMTSLOTS(0): return evdev_handle_mt_request(dev, size, ip); case EVIOCGKEY(0): return evdev_handle_get_val(client, dev, EV_KEY, dev->key, KEY_MAX, size, p, compat_mode); case EVIOCGLED(0): return evdev_handle_get_val(client, dev, EV_LED, dev->led, LED_MAX, size, p, compat_mode); case EVIOCGSND(0): return evdev_handle_get_val(client, dev, EV_SND, dev->snd, SND_MAX, size, p, compat_mode); case EVIOCGSW(0): return evdev_handle_get_val(client, dev, EV_SW, dev->sw, SW_MAX, size, p, compat_mode); case EVIOCGNAME(0): return str_to_user(dev->name, size, p); case EVIOCGPHYS(0): return str_to_user(dev->phys, size, p); case EVIOCGUNIQ(0): return str_to_user(dev->uniq, size, p); case EVIOC_MASK_SIZE(EVIOCSFF): if (input_ff_effect_from_user(p, size, &effect)) return -EFAULT; error = input_ff_upload(dev, &effect, file); if (error) return error; if (put_user(effect.id, &(((struct ff_effect __user *)p)->id))) return -EFAULT; return 0; } /* Multi-number variable-length handlers */ if (_IOC_TYPE(cmd) != 'E') return -EINVAL; if (_IOC_DIR(cmd) == _IOC_READ) { if ((_IOC_NR(cmd) & ~EV_MAX) == _IOC_NR(EVIOCGBIT(0, 0))) return handle_eviocgbit(dev, _IOC_NR(cmd) & EV_MAX, size, p, compat_mode); if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCGABS(0))) { if (!dev->absinfo) return -EINVAL; t = _IOC_NR(cmd) & ABS_MAX; abs = dev->absinfo[t]; if (copy_to_user(p, &abs, min_t(size_t, size, sizeof(struct input_absinfo)))) return -EFAULT; return 0; } } if (_IOC_DIR(cmd) == _IOC_WRITE) { if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCSABS(0))) { if (!dev->absinfo) return -EINVAL; t = _IOC_NR(cmd) & ABS_MAX; if (copy_from_user(&abs, p, min_t(size_t, size, sizeof(struct input_absinfo)))) return -EFAULT; if (size < sizeof(struct input_absinfo)) abs.resolution = 0; /* We can't change number of reserved MT slots */ if (t == ABS_MT_SLOT) return -EINVAL; /* * Take event lock to ensure that we are not * changing device parameters in the middle * of event. */ spin_lock_irq(&dev->event_lock); dev->absinfo[t] = abs; spin_unlock_irq(&dev->event_lock); return 0; } } return -EINVAL; } static long evdev_ioctl_handler(struct file *file, unsigned int cmd, void __user *p, int compat_mode) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; int retval; retval = mutex_lock_interruptible(&evdev->mutex); if (retval) return retval; if (!evdev->exist || client->revoked) { retval = -ENODEV; goto out; } retval = evdev_do_ioctl(file, cmd, p, compat_mode); out: mutex_unlock(&evdev->mutex); return retval; } static long evdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return evdev_ioctl_handler(file, cmd, (void __user *)arg, 0); } #ifdef CONFIG_COMPAT static long evdev_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { return evdev_ioctl_handler(file, cmd, compat_ptr(arg), 1); } #endif static const struct file_operations evdev_fops = { .owner = THIS_MODULE, .read = evdev_read, .write = evdev_write, .poll = evdev_poll, .open = evdev_open, .release = evdev_release, .unlocked_ioctl = evdev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = evdev_ioctl_compat, #endif .fasync = evdev_fasync, }; /* * Mark device non-existent. This disables writes, ioctls and * prevents new users from opening the device. Already posted * blocking reads will stay, however new ones will fail. */ static void evdev_mark_dead(struct evdev *evdev) { mutex_lock(&evdev->mutex); evdev->exist = false; mutex_unlock(&evdev->mutex); } static void evdev_cleanup(struct evdev *evdev) { struct input_handle *handle = &evdev->handle; evdev_mark_dead(evdev); evdev_hangup(evdev); /* evdev is marked dead so no one else accesses evdev->open */ if (evdev->open) { input_flush_device(handle, NULL); input_close_device(handle); } } /* * Create new evdev device. Note that input core serializes calls * to connect and disconnect. */ static int evdev_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { struct evdev *evdev; int minor; int dev_no; int error; minor = input_get_new_minor(EVDEV_MINOR_BASE, EVDEV_MINORS, true); if (minor < 0) { error = minor; pr_err("failed to reserve new minor: %d\n", error); return error; } evdev = kzalloc(sizeof(struct evdev), GFP_KERNEL); if (!evdev) { error = -ENOMEM; goto err_free_minor; } INIT_LIST_HEAD(&evdev->client_list); spin_lock_init(&evdev->client_lock); mutex_init(&evdev->mutex); evdev->exist = true; dev_no = minor; /* Normalize device number if it falls into legacy range */ if (dev_no < EVDEV_MINOR_BASE + EVDEV_MINORS) dev_no -= EVDEV_MINOR_BASE; dev_set_name(&evdev->dev, "event%d", dev_no); evdev->handle.dev = input_get_device(dev); evdev->handle.name = dev_name(&evdev->dev); evdev->handle.handler = handler; evdev->handle.private = evdev; evdev->dev.devt = MKDEV(INPUT_MAJOR, minor); evdev->dev.class = &input_class; evdev->dev.parent = &dev->dev; evdev->dev.release = evdev_free; device_initialize(&evdev->dev); error = input_register_handle(&evdev->handle); if (error) goto err_free_evdev; cdev_init(&evdev->cdev, &evdev_fops); error = cdev_device_add(&evdev->cdev, &evdev->dev); if (error) goto err_cleanup_evdev; return 0; err_cleanup_evdev: evdev_cleanup(evdev); input_unregister_handle(&evdev->handle); err_free_evdev: put_device(&evdev->dev); err_free_minor: input_free_minor(minor); return error; } static void evdev_disconnect(struct input_handle *handle) { struct evdev *evdev = handle->private; cdev_device_del(&evdev->cdev, &evdev->dev); evdev_cleanup(evdev); input_free_minor(MINOR(evdev->dev.devt)); input_unregister_handle(handle); put_device(&evdev->dev); } static const struct input_device_id evdev_ids[] = { { /* Matches all devices */ .flags = INPUT_DEVICE_ID_MATCH_EVBIT, .evbit = { BIT_MASK(EV_SYN) }, }, { } /* Terminating zero entry */ }; MODULE_DEVICE_TABLE(input, evdev_ids); static struct input_handler evdev_handler = { .events = evdev_events, .connect = evdev_connect, .disconnect = evdev_disconnect, .legacy_minors = true, .minor = EVDEV_MINOR_BASE, .name = "evdev", .id_table = evdev_ids, }; static int __init evdev_init(void) { return input_register_handler(&evdev_handler); } static void __exit evdev_exit(void) { input_unregister_handler(&evdev_handler); } module_init(evdev_init); module_exit(evdev_exit); MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>"); MODULE_DESCRIPTION("Input driver event char devices"); MODULE_LICENSE("GPL");
82 190 18 371 373 135 25 124 257 218 48 17 7 10 17 141 141 2 47 92 24 82 53 8 3 48 82 82 92 92 68 24 3 6 84 30 2 28 4 31 2 43 6 97 30 36 32 74 3 74 1 2 38 35 45 29 2 19 8 3 4 12 5 7 152 97 56 7 14 7 16 3 2 10 19 3 4 17 10 3 3 10 3 5 3 15 10 8 3 4 2 7 12 179 1 1 11 1 1 1 13 8 24 123 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * * Modified by Fred N. van Kempen, 01/29/93, to add line disciplines * which can be dynamically activated and de-activated by the line * discipline handling modules (like SLIP). */ #include <linux/bits.h> #include <linux/types.h> #include <linux/termios.h> #include <linux/errno.h> #include <linux/sched/signal.h> #include <linux/kernel.h> #include <linux/major.h> #include <linux/tty.h> #include <linux/fcntl.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/mutex.h> #include <linux/compat.h> #include <linux/termios_internal.h> #include "tty.h" #include <asm/io.h> #include <linux/uaccess.h> #undef DEBUG /* * Internal flag options for termios setting behavior */ #define TERMIOS_FLUSH BIT(0) #define TERMIOS_WAIT BIT(1) #define TERMIOS_TERMIO BIT(2) #define TERMIOS_OLD BIT(3) /** * tty_chars_in_buffer - characters pending * @tty: terminal * * Returns: the number of bytes of data in the device private output queue. If * no private method is supplied there is assumed to be no queue on the device. */ unsigned int tty_chars_in_buffer(struct tty_struct *tty) { if (tty->ops->chars_in_buffer) return tty->ops->chars_in_buffer(tty); return 0; } EXPORT_SYMBOL(tty_chars_in_buffer); /** * tty_write_room - write queue space * @tty: terminal * * Returns: the number of bytes that can be queued to this device at the present * time. The result should be treated as a guarantee and the driver cannot * offer a value it later shrinks by more than the number of bytes written. If * no method is provided, 2K is always returned and data may be lost as there * will be no flow control. */ unsigned int tty_write_room(struct tty_struct *tty) { if (tty->ops->write_room) return tty->ops->write_room(tty); return 2048; } EXPORT_SYMBOL(tty_write_room); /** * tty_driver_flush_buffer - discard internal buffer * @tty: terminal * * Discard the internal output buffer for this device. If no method is provided, * then either the buffer cannot be hardware flushed or there is no buffer * driver side. */ void tty_driver_flush_buffer(struct tty_struct *tty) { if (tty->ops->flush_buffer) tty->ops->flush_buffer(tty); } EXPORT_SYMBOL(tty_driver_flush_buffer); /** * tty_unthrottle - flow control * @tty: terminal * * Indicate that a @tty may continue transmitting data down the stack. Takes * the &tty_struct->termios_rwsem to protect against parallel * throttle/unthrottle and also to ensure the driver can consistently reference * its own termios data at this point when implementing software flow control. * * Drivers should however remember that the stack can issue a throttle, then * change flow control method, then unthrottle. */ void tty_unthrottle(struct tty_struct *tty) { down_write(&tty->termios_rwsem); if (test_and_clear_bit(TTY_THROTTLED, &tty->flags) && tty->ops->unthrottle) tty->ops->unthrottle(tty); tty->flow_change = TTY_FLOW_NO_CHANGE; up_write(&tty->termios_rwsem); } EXPORT_SYMBOL(tty_unthrottle); /** * tty_throttle_safe - flow control * @tty: terminal * * Indicate that a @tty should stop transmitting data down the stack. * tty_throttle_safe() will only attempt throttle if @tty->flow_change is * %TTY_THROTTLE_SAFE. Prevents an accidental throttle due to race conditions * when throttling is conditional on factors evaluated prior to throttling. * * Returns: %true if @tty is throttled (or was already throttled) */ bool tty_throttle_safe(struct tty_struct *tty) { guard(mutex)(&tty->throttle_mutex); if (tty_throttled(tty)) return true; if (tty->flow_change != TTY_THROTTLE_SAFE) return false; set_bit(TTY_THROTTLED, &tty->flags); if (tty->ops->throttle) tty->ops->throttle(tty); return true; } /** * tty_unthrottle_safe - flow control * @tty: terminal * * Similar to tty_unthrottle() but will only attempt unthrottle if * @tty->flow_change is %TTY_UNTHROTTLE_SAFE. Prevents an accidental unthrottle * due to race conditions when unthrottling is conditional on factors evaluated * prior to unthrottling. * * Returns: %true if @tty is unthrottled (or was already unthrottled) */ bool tty_unthrottle_safe(struct tty_struct *tty) { guard(mutex)(&tty->throttle_mutex); if (!tty_throttled(tty)) return true; if (tty->flow_change != TTY_UNTHROTTLE_SAFE) return false; clear_bit(TTY_THROTTLED, &tty->flags); if (tty->ops->unthrottle) tty->ops->unthrottle(tty); return true; } /** * tty_wait_until_sent - wait for I/O to finish * @tty: tty we are waiting for * @timeout: how long we will wait * * Wait for characters pending in a tty driver to hit the wire, or for a * timeout to occur (eg due to flow control). * * Locking: none */ void tty_wait_until_sent(struct tty_struct *tty, long timeout) { if (!timeout) timeout = MAX_SCHEDULE_TIMEOUT; timeout = wait_event_interruptible_timeout(tty->write_wait, !tty_chars_in_buffer(tty), timeout); if (timeout <= 0) return; if (timeout == MAX_SCHEDULE_TIMEOUT) timeout = 0; if (tty->ops->wait_until_sent) tty->ops->wait_until_sent(tty, timeout); } EXPORT_SYMBOL(tty_wait_until_sent); /* * Termios Helper Methods */ static void unset_locked_termios(struct tty_struct *tty, const struct ktermios *old) { struct ktermios *termios = &tty->termios; struct ktermios *locked = &tty->termios_locked; int i; #define NOSET_MASK(x, y, z) (x = ((x) & ~(z)) | ((y) & (z))) NOSET_MASK(termios->c_iflag, old->c_iflag, locked->c_iflag); NOSET_MASK(termios->c_oflag, old->c_oflag, locked->c_oflag); NOSET_MASK(termios->c_cflag, old->c_cflag, locked->c_cflag); NOSET_MASK(termios->c_lflag, old->c_lflag, locked->c_lflag); termios->c_line = locked->c_line ? old->c_line : termios->c_line; for (i = 0; i < NCCS; i++) termios->c_cc[i] = locked->c_cc[i] ? old->c_cc[i] : termios->c_cc[i]; /* FIXME: What should we do for i/ospeed */ } /** * tty_termios_copy_hw - copy hardware settings * @new: new termios * @old: old termios * * Propagate the hardware specific terminal setting bits from the @old termios * structure to the @new one. This is used in cases where the hardware does not * support reconfiguration or as a helper in some cases where only minimal * reconfiguration is supported. */ void tty_termios_copy_hw(struct ktermios *new, const struct ktermios *old) { /* The bits a dumb device handles in software. Smart devices need to always provide a set_termios method */ new->c_cflag &= HUPCL | CREAD | CLOCAL; new->c_cflag |= old->c_cflag & ~(HUPCL | CREAD | CLOCAL); new->c_ispeed = old->c_ispeed; new->c_ospeed = old->c_ospeed; } EXPORT_SYMBOL(tty_termios_copy_hw); /** * tty_termios_hw_change - check for setting change * @a: termios * @b: termios to compare * * Check if any of the bits that affect a dumb device have changed between the * two termios structures, or a speed change is needed. * * Returns: %true if change is needed */ bool tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b) { if (a->c_ispeed != b->c_ispeed || a->c_ospeed != b->c_ospeed) return true; if ((a->c_cflag ^ b->c_cflag) & ~(HUPCL | CREAD | CLOCAL)) return true; return false; } EXPORT_SYMBOL(tty_termios_hw_change); /** * tty_get_char_size - get size of a character * @cflag: termios cflag value * * Returns: size (in bits) of a character depending on @cflag's %CSIZE setting */ unsigned char tty_get_char_size(unsigned int cflag) { switch (cflag & CSIZE) { case CS5: return 5; case CS6: return 6; case CS7: return 7; case CS8: default: return 8; } } EXPORT_SYMBOL_GPL(tty_get_char_size); /** * tty_get_frame_size - get size of a frame * @cflag: termios cflag value * * Get the size (in bits) of a frame depending on @cflag's %CSIZE, %CSTOPB, and * %PARENB setting. The result is a sum of character size, start and stop bits * -- one bit each -- second stop bit (if set), and parity bit (if set). * * Returns: size (in bits) of a frame depending on @cflag's setting. */ unsigned char tty_get_frame_size(unsigned int cflag) { unsigned char bits = 2 + tty_get_char_size(cflag); if (cflag & CSTOPB) bits++; if (cflag & PARENB) bits++; if (cflag & ADDRB) bits++; return bits; } EXPORT_SYMBOL_GPL(tty_get_frame_size); /** * tty_set_termios - update termios values * @tty: tty to update * @new_termios: desired new value * * Perform updates to the termios values set on this @tty. A master pty's * termios should never be set. * * Locking: &tty_struct->termios_rwsem */ int tty_set_termios(struct tty_struct *tty, struct ktermios *new_termios) { struct ktermios old_termios; struct tty_ldisc *ld; WARN_ON(tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER); /* * Perform the actual termios internal changes under lock. */ /* FIXME: we need to decide on some locking/ordering semantics for the set_termios notification eventually */ down_write(&tty->termios_rwsem); old_termios = tty->termios; tty->termios = *new_termios; unset_locked_termios(tty, &old_termios); /* Reset any ADDRB changes, ADDRB is changed through ->rs485_config() */ tty->termios.c_cflag ^= (tty->termios.c_cflag ^ old_termios.c_cflag) & ADDRB; if (tty->ops->set_termios) tty->ops->set_termios(tty, &old_termios); else tty_termios_copy_hw(&tty->termios, &old_termios); ld = tty_ldisc_ref(tty); if (ld != NULL) { if (ld->ops->set_termios) ld->ops->set_termios(tty, &old_termios); tty_ldisc_deref(ld); } up_write(&tty->termios_rwsem); return 0; } EXPORT_SYMBOL_GPL(tty_set_termios); /* * Translate a "termio" structure into a "termios". Ugh. */ __weak int user_termio_to_kernel_termios(struct ktermios *termios, struct termio __user *termio) { struct termio v; if (copy_from_user(&v, termio, sizeof(struct termio))) return -EFAULT; termios->c_iflag = (0xffff0000 & termios->c_iflag) | v.c_iflag; termios->c_oflag = (0xffff0000 & termios->c_oflag) | v.c_oflag; termios->c_cflag = (0xffff0000 & termios->c_cflag) | v.c_cflag; termios->c_lflag = (0xffff0000 & termios->c_lflag) | v.c_lflag; termios->c_line = (0xffff0000 & termios->c_lflag) | v.c_line; memcpy(termios->c_cc, v.c_cc, NCC); return 0; } /* * Translate a "termios" structure into a "termio". Ugh. */ __weak int kernel_termios_to_user_termio(struct termio __user *termio, struct ktermios *termios) { struct termio v; memset(&v, 0, sizeof(struct termio)); v.c_iflag = termios->c_iflag; v.c_oflag = termios->c_oflag; v.c_cflag = termios->c_cflag; v.c_lflag = termios->c_lflag; v.c_line = termios->c_line; memcpy(v.c_cc, termios->c_cc, NCC); return copy_to_user(termio, &v, sizeof(struct termio)); } #ifdef TCGETS2 __weak int user_termios_to_kernel_termios(struct ktermios *k, struct termios2 __user *u) { return copy_from_user(k, u, sizeof(struct termios2)); } __weak int kernel_termios_to_user_termios(struct termios2 __user *u, struct ktermios *k) { return copy_to_user(u, k, sizeof(struct termios2)); } __weak int user_termios_to_kernel_termios_1(struct ktermios *k, struct termios __user *u) { return copy_from_user(k, u, sizeof(struct termios)); } __weak int kernel_termios_to_user_termios_1(struct termios __user *u, struct ktermios *k) { return copy_to_user(u, k, sizeof(struct termios)); } #else __weak int user_termios_to_kernel_termios(struct ktermios *k, struct termios __user *u) { return copy_from_user(k, u, sizeof(struct termios)); } __weak int kernel_termios_to_user_termios(struct termios __user *u, struct ktermios *k) { return copy_to_user(u, k, sizeof(struct termios)); } #endif /* TCGETS2 */ /** * set_termios - set termios values for a tty * @tty: terminal device * @arg: user data * @opt: option information * * Helper function to prepare termios data and run necessary other functions * before using tty_set_termios() to do the actual changes. * * Locking: called functions take &tty_struct->ldisc_sem and * &tty_struct->termios_rwsem locks * * Returns: 0 on success, an error otherwise */ static int set_termios(struct tty_struct *tty, void __user *arg, int opt) { struct ktermios tmp_termios; struct tty_ldisc *ld; int retval = tty_check_change(tty); if (retval) return retval; down_read(&tty->termios_rwsem); tmp_termios = tty->termios; up_read(&tty->termios_rwsem); if (opt & TERMIOS_TERMIO) { if (user_termio_to_kernel_termios(&tmp_termios, (struct termio __user *)arg)) return -EFAULT; #ifdef TCGETS2 } else if (opt & TERMIOS_OLD) { if (user_termios_to_kernel_termios_1(&tmp_termios, (struct termios __user *)arg)) return -EFAULT; } else { if (user_termios_to_kernel_termios(&tmp_termios, (struct termios2 __user *)arg)) return -EFAULT; } #else } else if (user_termios_to_kernel_termios(&tmp_termios, (struct termios __user *)arg)) return -EFAULT; #endif /* If old style Bfoo values are used then load c_ispeed/c_ospeed * with the real speed so its unconditionally usable */ tmp_termios.c_ispeed = tty_termios_input_baud_rate(&tmp_termios); tmp_termios.c_ospeed = tty_termios_baud_rate(&tmp_termios); if (opt & (TERMIOS_FLUSH|TERMIOS_WAIT)) { retry_write_wait: retval = wait_event_interruptible(tty->write_wait, !tty_chars_in_buffer(tty)); if (retval < 0) return retval; if (tty_write_lock(tty, false) < 0) goto retry_write_wait; /* Racing writer? */ if (tty_chars_in_buffer(tty)) { tty_write_unlock(tty); goto retry_write_wait; } ld = tty_ldisc_ref(tty); if (ld != NULL) { if ((opt & TERMIOS_FLUSH) && ld->ops->flush_buffer) ld->ops->flush_buffer(tty); tty_ldisc_deref(ld); } if ((opt & TERMIOS_WAIT) && tty->ops->wait_until_sent) { tty->ops->wait_until_sent(tty, 0); if (signal_pending(current)) { tty_write_unlock(tty); return -ERESTARTSYS; } } tty_set_termios(tty, &tmp_termios); tty_write_unlock(tty); } else { tty_set_termios(tty, &tmp_termios); } /* FIXME: Arguably if tmp_termios == tty->termios AND the actual requested termios was not tmp_termios then we may want to return an error as no user requested change has succeeded */ return 0; } static void copy_termios(struct tty_struct *tty, struct ktermios *kterm) { down_read(&tty->termios_rwsem); *kterm = tty->termios; up_read(&tty->termios_rwsem); } static void copy_termios_locked(struct tty_struct *tty, struct ktermios *kterm) { down_read(&tty->termios_rwsem); *kterm = tty->termios_locked; up_read(&tty->termios_rwsem); } static int get_termio(struct tty_struct *tty, struct termio __user *termio) { struct ktermios kterm; copy_termios(tty, &kterm); if (kernel_termios_to_user_termio(termio, &kterm)) return -EFAULT; return 0; } #ifdef TIOCGETP /* * These are deprecated, but there is limited support.. * * The "sg_flags" translation is a joke.. */ static int get_sgflags(struct tty_struct *tty) { int flags = 0; if (!L_ICANON(tty)) { if (L_ISIG(tty)) flags |= 0x02; /* cbreak */ else flags |= 0x20; /* raw */ } if (L_ECHO(tty)) flags |= 0x08; /* echo */ if (O_OPOST(tty)) if (O_ONLCR(tty)) flags |= 0x10; /* crmod */ return flags; } static int get_sgttyb(struct tty_struct *tty, struct sgttyb __user *sgttyb) { struct sgttyb tmp; down_read(&tty->termios_rwsem); tmp.sg_ispeed = tty->termios.c_ispeed; tmp.sg_ospeed = tty->termios.c_ospeed; tmp.sg_erase = tty->termios.c_cc[VERASE]; tmp.sg_kill = tty->termios.c_cc[VKILL]; tmp.sg_flags = get_sgflags(tty); up_read(&tty->termios_rwsem); return copy_to_user(sgttyb, &tmp, sizeof(tmp)) ? -EFAULT : 0; } static void set_sgflags(struct ktermios *termios, int flags) { termios->c_iflag = ICRNL | IXON; termios->c_oflag = 0; termios->c_lflag = ISIG | ICANON; if (flags & 0x02) { /* cbreak */ termios->c_iflag = 0; termios->c_lflag &= ~ICANON; } if (flags & 0x08) { /* echo */ termios->c_lflag |= ECHO | ECHOE | ECHOK | ECHOCTL | ECHOKE | IEXTEN; } if (flags & 0x10) { /* crmod */ termios->c_oflag |= OPOST | ONLCR; } if (flags & 0x20) { /* raw */ termios->c_iflag = 0; termios->c_lflag &= ~(ISIG | ICANON); } if (!(termios->c_lflag & ICANON)) { termios->c_cc[VMIN] = 1; termios->c_cc[VTIME] = 0; } } /** * set_sgttyb - set legacy terminal values * @tty: tty structure * @sgttyb: pointer to old style terminal structure * * Updates a terminal from the legacy BSD style terminal information structure. * * Locking: &tty_struct->termios_rwsem * * Returns: 0 on success, an error otherwise */ static int set_sgttyb(struct tty_struct *tty, struct sgttyb __user *sgttyb) { int retval; struct sgttyb tmp; struct ktermios termios; retval = tty_check_change(tty); if (retval) return retval; if (copy_from_user(&tmp, sgttyb, sizeof(tmp))) return -EFAULT; down_write(&tty->termios_rwsem); termios = tty->termios; termios.c_cc[VERASE] = tmp.sg_erase; termios.c_cc[VKILL] = tmp.sg_kill; set_sgflags(&termios, tmp.sg_flags); /* Try and encode into Bfoo format */ tty_termios_encode_baud_rate(&termios, termios.c_ispeed, termios.c_ospeed); up_write(&tty->termios_rwsem); tty_set_termios(tty, &termios); return 0; } #endif #ifdef TIOCGETC static int get_tchars(struct tty_struct *tty, struct tchars __user *tchars) { struct tchars tmp; down_read(&tty->termios_rwsem); tmp.t_intrc = tty->termios.c_cc[VINTR]; tmp.t_quitc = tty->termios.c_cc[VQUIT]; tmp.t_startc = tty->termios.c_cc[VSTART]; tmp.t_stopc = tty->termios.c_cc[VSTOP]; tmp.t_eofc = tty->termios.c_cc[VEOF]; tmp.t_brkc = tty->termios.c_cc[VEOL2]; /* what is brkc anyway? */ up_read(&tty->termios_rwsem); return copy_to_user(tchars, &tmp, sizeof(tmp)) ? -EFAULT : 0; } static int set_tchars(struct tty_struct *tty, struct tchars __user *tchars) { struct tchars tmp; if (copy_from_user(&tmp, tchars, sizeof(tmp))) return -EFAULT; down_write(&tty->termios_rwsem); tty->termios.c_cc[VINTR] = tmp.t_intrc; tty->termios.c_cc[VQUIT] = tmp.t_quitc; tty->termios.c_cc[VSTART] = tmp.t_startc; tty->termios.c_cc[VSTOP] = tmp.t_stopc; tty->termios.c_cc[VEOF] = tmp.t_eofc; tty->termios.c_cc[VEOL2] = tmp.t_brkc; /* what is brkc anyway? */ up_write(&tty->termios_rwsem); return 0; } #endif #ifdef TIOCGLTC static int get_ltchars(struct tty_struct *tty, struct ltchars __user *ltchars) { struct ltchars tmp; down_read(&tty->termios_rwsem); tmp.t_suspc = tty->termios.c_cc[VSUSP]; /* what is dsuspc anyway? */ tmp.t_dsuspc = tty->termios.c_cc[VSUSP]; tmp.t_rprntc = tty->termios.c_cc[VREPRINT]; /* what is flushc anyway? */ tmp.t_flushc = tty->termios.c_cc[VEOL2]; tmp.t_werasc = tty->termios.c_cc[VWERASE]; tmp.t_lnextc = tty->termios.c_cc[VLNEXT]; up_read(&tty->termios_rwsem); return copy_to_user(ltchars, &tmp, sizeof(tmp)) ? -EFAULT : 0; } static int set_ltchars(struct tty_struct *tty, struct ltchars __user *ltchars) { struct ltchars tmp; if (copy_from_user(&tmp, ltchars, sizeof(tmp))) return -EFAULT; down_write(&tty->termios_rwsem); tty->termios.c_cc[VSUSP] = tmp.t_suspc; /* what is dsuspc anyway? */ tty->termios.c_cc[VEOL2] = tmp.t_dsuspc; tty->termios.c_cc[VREPRINT] = tmp.t_rprntc; /* what is flushc anyway? */ tty->termios.c_cc[VEOL2] = tmp.t_flushc; tty->termios.c_cc[VWERASE] = tmp.t_werasc; tty->termios.c_cc[VLNEXT] = tmp.t_lnextc; up_write(&tty->termios_rwsem); return 0; } #endif /** * tty_change_softcar - carrier change ioctl helper * @tty: tty to update * @enable: enable/disable %CLOCAL * * Perform a change to the %CLOCAL state and call into the driver layer to make * it visible. * * Locking: &tty_struct->termios_rwsem. * * Returns: 0 on success, an error otherwise */ static int tty_change_softcar(struct tty_struct *tty, bool enable) { int ret = 0; struct ktermios old; tcflag_t bit = enable ? CLOCAL : 0; down_write(&tty->termios_rwsem); old = tty->termios; tty->termios.c_cflag &= ~CLOCAL; tty->termios.c_cflag |= bit; if (tty->ops->set_termios) tty->ops->set_termios(tty, &old); if (C_CLOCAL(tty) != bit) ret = -EINVAL; up_write(&tty->termios_rwsem); return ret; } /** * tty_mode_ioctl - mode related ioctls * @tty: tty for the ioctl * @cmd: command * @arg: ioctl argument * * Perform non-line discipline specific mode control ioctls. This is designed * to be called by line disciplines to ensure they provide consistent mode * setting. */ int tty_mode_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct tty_struct *real_tty; void __user *p = (void __user *)arg; int ret = 0; struct ktermios kterm; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) real_tty = tty->link; else real_tty = tty; switch (cmd) { #ifdef TIOCGETP case TIOCGETP: return get_sgttyb(real_tty, (struct sgttyb __user *) arg); case TIOCSETP: case TIOCSETN: return set_sgttyb(real_tty, (struct sgttyb __user *) arg); #endif #ifdef TIOCGETC case TIOCGETC: return get_tchars(real_tty, p); case TIOCSETC: return set_tchars(real_tty, p); #endif #ifdef TIOCGLTC case TIOCGLTC: return get_ltchars(real_tty, p); case TIOCSLTC: return set_ltchars(real_tty, p); #endif case TCSETSF: return set_termios(real_tty, p, TERMIOS_FLUSH | TERMIOS_WAIT | TERMIOS_OLD); case TCSETSW: return set_termios(real_tty, p, TERMIOS_WAIT | TERMIOS_OLD); case TCSETS: return set_termios(real_tty, p, TERMIOS_OLD); #ifndef TCGETS2 case TCGETS: copy_termios(real_tty, &kterm); if (kernel_termios_to_user_termios((struct termios __user *)arg, &kterm)) ret = -EFAULT; return ret; #else case TCGETS: copy_termios(real_tty, &kterm); if (kernel_termios_to_user_termios_1((struct termios __user *)arg, &kterm)) ret = -EFAULT; return ret; case TCGETS2: copy_termios(real_tty, &kterm); if (kernel_termios_to_user_termios((struct termios2 __user *)arg, &kterm)) ret = -EFAULT; return ret; case TCSETSF2: return set_termios(real_tty, p, TERMIOS_FLUSH | TERMIOS_WAIT); case TCSETSW2: return set_termios(real_tty, p, TERMIOS_WAIT); case TCSETS2: return set_termios(real_tty, p, 0); #endif case TCGETA: return get_termio(real_tty, p); case TCSETAF: return set_termios(real_tty, p, TERMIOS_FLUSH | TERMIOS_WAIT | TERMIOS_TERMIO); case TCSETAW: return set_termios(real_tty, p, TERMIOS_WAIT | TERMIOS_TERMIO); case TCSETA: return set_termios(real_tty, p, TERMIOS_TERMIO); #ifndef TCGETS2 case TIOCGLCKTRMIOS: copy_termios_locked(real_tty, &kterm); if (kernel_termios_to_user_termios((struct termios __user *)arg, &kterm)) ret = -EFAULT; return ret; case TIOCSLCKTRMIOS: if (!checkpoint_restore_ns_capable(&init_user_ns)) return -EPERM; copy_termios_locked(real_tty, &kterm); if (user_termios_to_kernel_termios(&kterm, (struct termios __user *) arg)) return -EFAULT; down_write(&real_tty->termios_rwsem); real_tty->termios_locked = kterm; up_write(&real_tty->termios_rwsem); return 0; #else case TIOCGLCKTRMIOS: copy_termios_locked(real_tty, &kterm); if (kernel_termios_to_user_termios_1((struct termios __user *)arg, &kterm)) ret = -EFAULT; return ret; case TIOCSLCKTRMIOS: if (!checkpoint_restore_ns_capable(&init_user_ns)) return -EPERM; copy_termios_locked(real_tty, &kterm); if (user_termios_to_kernel_termios_1(&kterm, (struct termios __user *) arg)) return -EFAULT; down_write(&real_tty->termios_rwsem); real_tty->termios_locked = kterm; up_write(&real_tty->termios_rwsem); return ret; #endif #ifdef TCGETX case TCGETX: case TCSETX: case TCSETXW: case TCSETXF: return -ENOTTY; #endif case TIOCGSOFTCAR: copy_termios(real_tty, &kterm); ret = put_user((kterm.c_cflag & CLOCAL) ? 1 : 0, (int __user *)arg); return ret; case TIOCSSOFTCAR: if (get_user(arg, (unsigned int __user *) arg)) return -EFAULT; return tty_change_softcar(real_tty, arg); default: return -ENOIOCTLCMD; } } EXPORT_SYMBOL_GPL(tty_mode_ioctl); /* Caller guarantees ldisc reference is held */ static int __tty_perform_flush(struct tty_struct *tty, unsigned long arg) { struct tty_ldisc *ld = tty->ldisc; switch (arg) { case TCIFLUSH: if (ld && ld->ops->flush_buffer) { ld->ops->flush_buffer(tty); tty_unthrottle(tty); } break; case TCIOFLUSH: if (ld && ld->ops->flush_buffer) { ld->ops->flush_buffer(tty); tty_unthrottle(tty); } fallthrough; case TCOFLUSH: tty_driver_flush_buffer(tty); break; default: return -EINVAL; } return 0; } int tty_perform_flush(struct tty_struct *tty, unsigned long arg) { struct tty_ldisc *ld; int retval = tty_check_change(tty); if (retval) return retval; ld = tty_ldisc_ref_wait(tty); retval = __tty_perform_flush(tty, arg); if (ld) tty_ldisc_deref(ld); return retval; } EXPORT_SYMBOL_GPL(tty_perform_flush); int n_tty_ioctl_helper(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { int retval; switch (cmd) { case TCXONC: retval = tty_check_change(tty); if (retval) return retval; switch (arg) { case TCOOFF: spin_lock_irq(&tty->flow.lock); if (!tty->flow.tco_stopped) { tty->flow.tco_stopped = true; __stop_tty(tty); } spin_unlock_irq(&tty->flow.lock); break; case TCOON: spin_lock_irq(&tty->flow.lock); if (tty->flow.tco_stopped) { tty->flow.tco_stopped = false; __start_tty(tty); } spin_unlock_irq(&tty->flow.lock); break; case TCIOFF: if (STOP_CHAR(tty) != __DISABLED_CHAR) retval = tty_send_xchar(tty, STOP_CHAR(tty)); break; case TCION: if (START_CHAR(tty) != __DISABLED_CHAR) retval = tty_send_xchar(tty, START_CHAR(tty)); break; default: return -EINVAL; } return retval; case TCFLSH: retval = tty_check_change(tty); if (retval) return retval; return __tty_perform_flush(tty, arg); default: /* Try the mode commands */ return tty_mode_ioctl(tty, cmd, arg); } } EXPORT_SYMBOL(n_tty_ioctl_helper);
8 3 6 7 8 6 8 8 356 10 357 357 357 191 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 /* SPDX-License-Identifier: GPL-2.0 * * page_pool/helpers.h * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> * Copyright (C) 2016 Red Hat, Inc. */ /** * DOC: page_pool allocator * * The page_pool allocator is optimized for recycling page or page fragment used * by skb packet and xdp frame. * * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(), * which allocate memory with or without page splitting depending on the * requested memory size. * * If the driver knows that it always requires full pages or its allocations are * always smaller than half a page, it can use one of the more specific API * calls: * * 1. page_pool_alloc_pages(): allocate memory without page splitting when * driver knows that the memory it need is always bigger than half of the page * allocated from page pool. There is no cache line dirtying for 'struct page' * when a page is recycled back to the page pool. * * 2. page_pool_alloc_frag(): allocate memory with page splitting when driver * knows that the memory it need is always smaller than or equal to half of the * page allocated from page pool. Page splitting enables memory saving and thus * avoids TLB/cache miss for data access, but there also is some cost to * implement page splitting, mainly some cache line dirtying/bouncing for * 'struct page' and atomic operation for page->pp_ref_count. * * The API keeps track of in-flight pages, in order to let API users know when * it is safe to free a page_pool object, the API users must call * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or * attach the page_pool object to a page_pool-aware object like skbs marked with * skb_mark_for_recycle(). * * page_pool_put_page() may be called multiple times on the same page if a page * is split into multiple fragments. For the last fragment, it will either * recycle the page, or in case of page->_refcount > 1, it will release the DMA * mapping and in-flight state accounting. * * dma_sync_single_range_for_device() is only called for the last fragment when * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the * last freed fragment to do the sync_for_device operation for all fragments in * the same page when a page is split. The API user must setup pool->p.max_len * and pool->p.offset correctly and ensure that page_pool_put_page() is called * with dma_sync_size being -1 for fragment API. */ #ifndef _NET_PAGE_POOL_HELPERS_H #define _NET_PAGE_POOL_HELPERS_H #include <linux/dma-mapping.h> #include <net/page_pool/types.h> #include <net/net_debug.h> #include <net/netmem.h> #ifdef CONFIG_PAGE_POOL_STATS /* Deprecated driver-facing API, use netlink instead */ int page_pool_ethtool_stats_get_count(void); u8 *page_pool_ethtool_stats_get_strings(u8 *data); u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats); bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats); #else static inline int page_pool_ethtool_stats_get_count(void) { return 0; } static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) { return data; } static inline u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { return data; } #endif /** * page_pool_dev_alloc_pages() - allocate a page. * @pool: pool from which to allocate * * Get a page from the page allocator or page_pool caches. */ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_pages(pool, gfp); } /** * page_pool_dev_alloc_frag() - allocate a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: requested size * * Get a page fragment from the page allocator or page_pool caches. * * Return: allocated page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_frag(pool, offset, size, gfp); } static inline netmem_ref page_pool_alloc_netmem(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; netmem_ref netmem; if ((*size << 1) > max_size) { *size = max_size; *offset = 0; return page_pool_alloc_netmems(pool, gfp); } netmem = page_pool_alloc_frag_netmem(pool, offset, *size, gfp); if (unlikely(!netmem)) return 0; /* There is very likely not enough space for another fragment, so append * the remaining size to the current fragment to avoid truesize * underestimate problem. */ if (pool->frag_offset + *size > max_size) { *size = max_size - *offset; pool->frag_offset = max_size; } return netmem; } static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool, unsigned int *offset, unsigned int *size) { gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; return page_pool_alloc_netmem(pool, offset, size, gfp); } static inline netmem_ref page_pool_dev_alloc_netmems(struct page_pool *pool) { gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; return page_pool_alloc_netmems(pool, gfp); } static inline struct page *page_pool_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) { return netmem_to_page(page_pool_alloc_netmem(pool, offset, size, gfp)); } /** * page_pool_dev_alloc() - allocate a page or a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: in as the requested size, out as the allocated size * * Get a page or a page fragment from the page allocator or page_pool caches * depending on the requested size in order to allocate memory with least memory * utilization and performance penalty. * * Return: allocated page or page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc(pool, offset, size, gfp); } static inline void *page_pool_alloc_va(struct page_pool *pool, unsigned int *size, gfp_t gfp) { unsigned int offset; struct page *page; /* Mask off __GFP_HIGHMEM to ensure we can use page_address() */ page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM); if (unlikely(!page)) return NULL; return page_address(page) + offset; } /** * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its * va. * @pool: pool from which to allocate * @size: in as the requested size, out as the allocated size * * This is just a thin wrapper around the page_pool_alloc() API, and * it returns va of the allocated page or page fragment. * * Return: the va for the allocated page or page fragment, otherwise return NULL. */ static inline void *page_pool_dev_alloc_va(struct page_pool *pool, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_va(pool, size, gfp); } /** * page_pool_get_dma_dir() - Retrieve the stored DMA direction. * @pool: pool from which page was allocated * * Get the stored dma direction. A driver might decide to store this locally * and avoid the extra cache line from page_pool to determine the direction. */ static inline enum dma_data_direction page_pool_get_dma_dir(const struct page_pool *pool) { return pool->p.dma_dir; } static inline void page_pool_fragment_netmem(netmem_ref netmem, long nr) { atomic_long_set(netmem_get_pp_ref_count_ref(netmem), nr); } /** * page_pool_fragment_page() - split a fresh page into fragments * @page: page to split * @nr: references to set * * pp_ref_count represents the number of outstanding references to the page, * which will be freed using page_pool APIs (rather than page allocator APIs * like put_page()). Such references are usually held by page_pool-aware * objects like skbs marked for page pool recycling. * * This helper allows the caller to take (set) multiple references to a * freshly allocated page. The page must be freshly allocated (have a * pp_ref_count of 1). This is commonly done by drivers and * "fragment allocators" to save atomic operations - either when they know * upfront how many references they will need; or to take MAX references and * return the unused ones with a single atomic dec(), instead of performing * multiple atomic inc() operations. */ static inline void page_pool_fragment_page(struct page *page, long nr) { page_pool_fragment_netmem(page_to_netmem(page), nr); } static inline long page_pool_unref_netmem(netmem_ref netmem, long nr) { atomic_long_t *pp_ref_count = netmem_get_pp_ref_count_ref(netmem); long ret; /* If nr == pp_ref_count then we have cleared all remaining * references to the page: * 1. 'n == 1': no need to actually overwrite it. * 2. 'n != 1': overwrite it with one, which is the rare case * for pp_ref_count draining. * * The main advantage to doing this is that not only we avoid a atomic * update, as an atomic_read is generally a much cheaper operation than * an atomic update, especially when dealing with a page that may be * referenced by only 2 or 3 users; but also unify the pp_ref_count * handling by ensuring all pages have partitioned into only 1 piece * initially, and only overwrite it when the page is partitioned into * more than one piece. */ if (atomic_long_read(pp_ref_count) == nr) { /* As we have ensured nr is always one for constant case using * the BUILD_BUG_ON(), only need to handle the non-constant case * here for pp_ref_count draining, which is a rare case. */ BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); if (!__builtin_constant_p(nr)) atomic_long_set(pp_ref_count, 1); return 0; } ret = atomic_long_sub_return(nr, pp_ref_count); WARN_ON(ret < 0); /* We are the last user here too, reset pp_ref_count back to 1 to * ensure all pages have been partitioned into 1 piece initially, * this should be the rare case when the last two fragment users call * page_pool_unref_page() currently. */ if (unlikely(!ret)) atomic_long_set(pp_ref_count, 1); return ret; } static inline long page_pool_unref_page(struct page *page, long nr) { return page_pool_unref_netmem(page_to_netmem(page), nr); } static inline void page_pool_ref_netmem(netmem_ref netmem) { atomic_long_inc(netmem_get_pp_ref_count_ref(netmem)); } static inline void page_pool_ref_page(struct page *page) { page_pool_ref_netmem(page_to_netmem(page)); } static inline bool page_pool_unref_and_test(netmem_ref netmem) { /* If page_pool_unref_page() returns 0, we were the last user */ return page_pool_unref_netmem(netmem, 1) == 0; } static inline void page_pool_put_netmem(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL if (!page_pool_unref_and_test(netmem)) return; page_pool_put_unrefed_netmem(pool, netmem, dma_sync_size, allow_direct); #endif } /** * page_pool_put_page() - release a reference to a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @dma_sync_size: how much of the page may have been touched by the device * @allow_direct: released by the consumer, allow lockless caching * * The outcome of this depends on the page refcnt. If the driver bumps * the refcnt > 1 this will unmap the page. If the page refcnt is 1 * the allocator owns the page and will try to recycle it in one of the pool * caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device * using dma_sync_single_range_for_device(). */ static inline void page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { page_pool_put_netmem(pool, page_to_netmem(page), dma_sync_size, allow_direct); } static inline void page_pool_put_full_netmem(struct page_pool *pool, netmem_ref netmem, bool allow_direct) { page_pool_put_netmem(pool, netmem, -1, allow_direct); } /** * page_pool_put_full_page() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @allow_direct: released by the consumer, allow lockless caching * * Similar to page_pool_put_page(), but will DMA sync the entire memory area * as configured in &page_pool_params.max_len. */ static inline void page_pool_put_full_page(struct page_pool *pool, struct page *page, bool allow_direct) { page_pool_put_netmem(pool, page_to_netmem(page), -1, allow_direct); } /** * page_pool_recycle_direct() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * * Similar to page_pool_put_full_page() but caller must guarantee safe context * (e.g NAPI), since it will recycle the page directly into the pool fast cache. */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { page_pool_put_full_page(pool, page, true); } static inline void page_pool_recycle_direct_netmem(struct page_pool *pool, netmem_ref netmem) { page_pool_put_full_netmem(pool, netmem, true); } #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) /** * page_pool_free_va() - free a va into the page_pool * @pool: pool from which va was allocated * @va: va to be freed * @allow_direct: freed by the consumer, allow lockless caching * * Free a va allocated from page_pool_allo_va(). */ static inline void page_pool_free_va(struct page_pool *pool, void *va, bool allow_direct) { page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct); } static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) { dma_addr_t ret = netmem_get_dma_addr(netmem); if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) ret <<= PAGE_SHIFT; return ret; } /** * page_pool_get_dma_addr() - Retrieve the stored DMA address. * @page: page allocated from a page pool * * Fetch the DMA address of the page. The page pool to which the page belongs * must had been created with PP_FLAG_DMA_MAP. */ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { return page_pool_get_dma_addr_netmem(page_to_netmem(page)); } static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool, const dma_addr_t dma_addr, u32 offset, u32 dma_sync_size) { dma_sync_single_range_for_cpu(pool->p.dev, dma_addr, offset + pool->p.offset, dma_sync_size, page_pool_get_dma_dir(pool)); } /** * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW * @pool: &page_pool the @page belongs to * @page: page to sync * @offset: offset from page start to "hard" start if using PP frags * @dma_sync_size: size of the data written to the page * * Can be used as a shorthand to sync Rx pages before accessing them in the * driver. Caller must ensure the pool was created with ``PP_FLAG_DMA_MAP``. * Note that this version performs DMA sync unconditionally, even if the * associated PP doesn't perform sync-for-device. */ static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, const struct page *page, u32 offset, u32 dma_sync_size) { __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr(page), offset, dma_sync_size); } static inline void page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool, const netmem_ref netmem, u32 offset, u32 dma_sync_size) { if (!pool->dma_sync_for_cpu) return; __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr_netmem(netmem), offset, dma_sync_size); } static inline void page_pool_get(struct page_pool *pool) { refcount_inc(&pool->user_cnt); } static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); } static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) { if (unlikely(pool->p.nid != new_nid)) page_pool_update_nid(pool, new_nid); } /** * page_pool_is_unreadable() - will allocated buffers be unreadable for the CPU * @pool: queried page pool * * Check if page pool will return buffers which are unreadable to the CPU / * kernel. This will only be the case if user space bound a memory provider (mp) * which returns unreadable memory to the queue served by the page pool. * If %PP_FLAG_ALLOW_UNREADABLE_NETMEM was set but there is no mp bound * this helper will return false. See also netif_rxq_has_unreadable_mp(). * * Return: true if memory allocated by the page pool may be unreadable */ static inline bool page_pool_is_unreadable(struct page_pool *pool) { return !!pool->mp_ops; } #endif /* _NET_PAGE_POOL_HELPERS_H */
96 9 24 32 27 137 2 25 33 102 28 28 59 53 7 25 8 36 59 38 22 37 22 14 9 2 24 93 2 130 43 210 210 205 211 2 4 20 178 13 13 13 13 1 12 36 37 1 23 16 49 10 42 10 10 22 60 55 54 6 3 10 15 1 26 52 5 5 2 27 45 72 5 12 60 9 19 37 19 37 6 48 41 11 3 7 255 3 254 73 190 25 21 23 20 20 70 3 172 8 238 193 237 172 1 232 5 52 191 232 21 21 4 8 10 6 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/file.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/file.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext4 fs regular file handling primitives * * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) */ #include <linux/time.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/mount.h> #include <linux/path.h> #include <linux/dax.h> #include <linux/quotaops.h> #include <linux/pagevec.h> #include <linux/uio.h> #include <linux/mman.h> #include <linux/backing-dev.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "truncate.h" /* * Returns %true if the given DIO request should be attempted with DIO, or * %false if it should fall back to buffered I/O. * * DIO isn't well specified; when it's unsupported (either due to the request * being misaligned, or due to the file not supporting DIO at all), filesystems * either fall back to buffered I/O or return EINVAL. For files that don't use * any special features like encryption or verity, ext4 has traditionally * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too. * In this case, we should attempt the DIO, *not* fall back to buffered I/O. * * In contrast, in cases where DIO is unsupported due to ext4 features, ext4 * traditionally falls back to buffered I/O. * * This function implements the traditional ext4 behavior in all these cases. */ static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); u32 dio_align = ext4_dio_alignment(inode); if (dio_align == 0) return false; if (dio_align == 1) return true; return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align); } static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } if (!ext4_should_use_dio(iocb, to)) { inode_unlock_shared(inode); /* * Fallback to buffered I/O if the operation being performed on * the inode is not supported by direct I/O. The IOCB_DIRECT * flag needs to be cleared here in order to ensure that the * direct I/O path within generic_file_read_iter() is not * taken. */ iocb->ki_flags &= ~IOCB_DIRECT; return generic_file_read_iter(iocb, to); } ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } /* * Recheck under inode lock - at this point we are sure it cannot * change anymore */ if (!IS_DAX(inode)) { inode_unlock_shared(inode); /* Fallback to buffered IO in case we cannot support DAX */ return generic_file_read_iter(iocb, to); } ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } #endif static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (!iov_iter_count(to)) return 0; /* skip atime */ #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_read_iter(iocb, to); #endif if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_read_iter(iocb, to); return generic_file_read_iter(iocb, to); } static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct inode *inode = file_inode(in); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; return filemap_splice_read(in, ppos, pipe, len, flags); } /* * Called when an inode is released. Note that this is different * from ext4_file_open: open gets called at every open, but release * gets called only when /all/ the files are closed. */ static int ext4_release_file(struct inode *inode, struct file *filp) { if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { ext4_alloc_da_blocks(inode); ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1) && !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) ext4_htree_free_dir_info(filp->private_data); return 0; } /* * This tests whether the IO in question is block-aligned or not. * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they * are converted to written only after the IO is complete. Until they are * mapped, these blocks appear as holes, so dio_zero_block() will assume that * it needs to zero out portions of the start and/or end block. If 2 AIO * threads are at work on the same unwritten block, they must be synchronized * or one thread will zero the other's data, causing corruption. */ static bool ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; unsigned long blockmask = sb->s_blocksize - 1; if ((pos | iov_iter_alignment(from)) & blockmask) return true; return false; } static bool ext4_extending_io(struct inode *inode, loff_t offset, size_t len) { if (offset + len > i_size_read(inode) || offset + len > EXT4_I(inode)->i_disksize) return true; return false; } /* Is IO overwriting allocated or initialized blocks? */ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len, bool *unwritten) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; int err, blklen; if (pos + len > i_size_read(inode)) return false; map.m_lblk = pos >> blkbits; map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); blklen = map.m_len; err = ext4_map_blocks(NULL, inode, &map, 0); if (err != blklen) return false; /* * 'err==len' means that all of the blocks have been preallocated, * regardless of whether they have been initialized or not. We need to * check m_flags to distinguish the unwritten extents. */ *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); return true; } static ssize_t ext4_generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; ret = generic_write_checks(iocb, from); if (ret <= 0) return ret; /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) return -EFBIG; iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } return iov_iter_count(from); } static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret, count; count = ext4_generic_write_checks(iocb, from); if (count <= 0) return count; ret = file_modified(iocb->ki_filp); if (ret) return ret; return count; } static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; ret = generic_perform_write(iocb, from); out: inode_unlock(inode); if (unlikely(ret <= 0)) return ret; return generic_write_sync(iocb, ret); } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, ssize_t written, ssize_t count) { handle_t *handle; lockdep_assert_held_write(&inode->i_rwsem); handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return PTR_ERR(handle); if (ext4_update_inode_size(inode, offset + written)) { int ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) { ext4_journal_stop(handle); return ret; } } if ((written == count) && inode->i_nlink) ext4_orphan_del(handle, inode); ext4_journal_stop(handle); return written; } /* * Clean up the inode after DIO or DAX extending write has completed and the * inode size has been updated using ext4_handle_inode_extension(). */ static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc) { lockdep_assert_held_write(&inode->i_rwsem); if (need_trunc) { ext4_truncate_failed_write(inode); /* * If the truncate operation failed early, then the inode may * still be on the orphan list. In that case, we need to try * remove the inode from the in-memory linked list. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); return; } /* * If i_disksize got extended either due to writeback of delalloc * blocks or extending truncate while the DIO was running we could fail * to cleanup the orphan list in ext4_handle_inode_extension(). Do it * now. */ if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) { handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { /* * The write has successfully completed. Not much to * do with the error here so just cleanup the orphan * list and hope for the best. */ ext4_orphan_del(NULL, inode); return; } ext4_orphan_del(handle, inode); ext4_journal_stop(handle); } } static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned int flags) { loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) && (iocb->ki_flags & IOCB_ATOMIC)) error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos, size); else if (!error && size && flags & IOMAP_DIO_UNWRITTEN) error = ext4_convert_unwritten_extents(NULL, inode, pos, size); if (error) return error; /* * Note that EXT4_I(inode)->i_disksize can get extended up to * inode->i_size while the I/O was running due to writeback of delalloc * blocks. But the code in ext4_iomap_alloc() is careful to use * zeroed/unwritten extents if this is possible; thus we won't leave * uninitialized blocks in a file even if we didn't succeed in writing * as much as we intended. Also we can race with truncate or write * expanding the file so we have to be a bit careful here. */ if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && pos + size <= i_size_read(inode)) return 0; error = ext4_handle_inode_extension(inode, pos, size, size); return error < 0 ? error : 0; } static const struct iomap_dio_ops ext4_dio_write_ops = { .end_io = ext4_dio_write_end_io, }; /* * The intention here is to start with shared lock acquired then see if any * condition requires an exclusive inode lock. If yes, then we restart the * whole operation by releasing the shared lock and acquiring exclusive lock. * * - For unaligned_io we never take shared lock as it may cause data corruption * when two unaligned IO tries to modify the same block e.g. while zeroing. * * - For extending writes case we don't take the shared lock, since it requires * updating inode i_disksize and/or orphan handling with exclusive lock. * * - shared locking will only be true mostly with overwrites, including * initialized blocks and unwritten blocks. For overwrite unwritten blocks * we protect splitting extents by i_data_sem in ext4_inode_info, so we can * also release exclusive i_rwsem lock. * * - Otherwise we will switch to exclusive i_rwsem lock. */ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, bool *ilock_shared, bool *extend, bool *unwritten, int *dio_flags) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); loff_t offset; size_t count; ssize_t ret; bool overwrite, unaligned_io; restart: ret = ext4_generic_write_checks(iocb, from); if (ret <= 0) goto out; offset = iocb->ki_pos; count = ret; unaligned_io = ext4_unaligned_io(inode, from, offset); *extend = ext4_extending_io(inode, offset, count); overwrite = ext4_overwrite_io(inode, offset, count, unwritten); /* * Determine whether we need to upgrade to an exclusive lock. This is * required to change security info in file_modified(), for extending * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten * extents (as partial block zeroing may be required). * * Note that unaligned writes are allowed under shared lock so long as * they are pure overwrites. Otherwise, concurrent unaligned writes risk * data corruption due to partial block zeroing in the dio layer, and so * the I/O must occur exclusively. */ if (*ilock_shared && ((!IS_NOSEC(inode) || *extend || !overwrite || (unaligned_io && *unwritten)))) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; } inode_unlock_shared(inode); *ilock_shared = false; inode_lock(inode); goto restart; } /* * Now that locking is settled, determine dio flags and exclusivity * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce * behavior already. The inode lock is already held exclusive if the * write is non-overwrite or extending, so drain all outstanding dio and * set the force wait dio flag. */ if (!*ilock_shared && (unaligned_io || *extend)) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; } if (unaligned_io && (!overwrite || *unwritten)) inode_dio_wait(inode); *dio_flags = IOMAP_DIO_FORCE_WAIT; } ret = file_modified(file); if (ret < 0) goto out; return count; out: if (*ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); return ret; } static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; handle_t *handle; struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); const struct iomap_ops *iomap_ops = &ext4_iomap_ops; bool extend = false, unwritten = false; bool ilock_shared = true; int dio_flags = 0; /* * Quick check here without any i_rwsem lock to see if it is extending * IO. A more reliable check is done in ext4_dio_write_checks() with * proper locking in place. */ if (offset + count > i_size_read(inode)) ilock_shared = false; if (iocb->ki_flags & IOCB_NOWAIT) { if (ilock_shared) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { if (!inode_trylock(inode)) return -EAGAIN; } } else { if (ilock_shared) inode_lock_shared(inode); else inode_lock(inode); } /* Fallback to buffered I/O if the inode does not support direct I/O. */ if (!ext4_should_use_dio(iocb, from)) { if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); return ext4_buffered_write_iter(iocb, from); } /* * Prevent inline data from being created since we are going to allocate * blocks for DIO. We know the inode does not currently have inline data * because ext4_should_use_dio() checked for it, but we have to clear * the state flag before the write checks because a lock cycle could * introduce races with other writers. */ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, &unwritten, &dio_flags); if (ret <= 0) return ret; offset = iocb->ki_pos; count = ret; if (extend) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (ret) goto out; } if (ilock_shared && !unwritten) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, dio_flags, NULL, 0); if (ret == -ENOTBLK) ret = 0; if (extend) { /* * We always perform extending DIO write synchronously so by * now the IO is completed and ext4_handle_inode_extension() * was called. Cleanup the inode in case of error or race with * writeback of delalloc blocks. */ WARN_ON_ONCE(ret == -EIOCBQUEUED); ext4_inode_extension_cleanup(inode, ret < 0); } out: if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); if (ret >= 0 && iov_iter_count(from)) { ssize_t err; loff_t endbyte; /* * There is no support for atomic writes on buffered-io yet, * we should never fallback to buffered-io for DIO atomic * writes. */ WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC); offset = iocb->ki_pos; err = ext4_buffered_write_iter(iocb, from); if (err < 0) return err; /* * We need to ensure that the pages within the page cache for * the range covered by this I/O are written to disk and * invalidated. This is in attempt to preserve the expected * direct I/O semantics in the case we fallback to buffered I/O * to complete off the I/O request. */ ret += err; endbyte = offset + err - 1; err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, offset, endbyte); if (!err) invalidate_mapping_pages(iocb->ki_filp->f_mapping, offset >> PAGE_SHIFT, endbyte >> PAGE_SHIFT); } return ret; } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; size_t count; loff_t offset; handle_t *handle; bool extend = false; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) return -EAGAIN; } else { inode_lock(inode); } ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; offset = iocb->ki_pos; count = iov_iter_count(from); if (offset + count > EXT4_I(inode)->i_disksize) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); if (ret) { ext4_journal_stop(handle); goto out; } extend = true; ext4_journal_stop(handle); } ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); if (extend) { ret = ext4_handle_inode_extension(inode, offset, ret, count); ext4_inode_extension_cleanup(inode, ret < (ssize_t)count); } out: inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } #endif static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { int ret; struct inode *inode = file_inode(iocb->ki_filp); ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif if (iocb->ki_flags & IOCB_ATOMIC) { size_t len = iov_iter_count(from); if (len < EXT4_SB(inode->i_sb)->s_awu_min || len > EXT4_SB(inode->i_sb)->s_awu_max) return -EINVAL; ret = generic_atomic_write_valid(iocb, from); if (ret) return ret; } if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); else return ext4_buffered_write_iter(iocb, from); } #ifdef CONFIG_FS_DAX static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { int error = 0; vm_fault_t result; int retries = 0; handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; /* * We have to distinguish real writes from writes which will result in a * COW page; COW writes should *not* poke the journal (the file will not * be changed). Doing so would cause unintended failures when mounted * read-only. * * We check for VM_SHARED rather than vmf->cow_page since the latter is * unset for order != 0 (i.e. only in do_cow_fault); for * other sizes, dax_iomap_fault will handle splitting / fallback so that * we eventually come back with a COW page. */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pfn; if (write) { sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(mapping); retry: handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); if (IS_ERR(handle)) { filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); return VM_FAULT_SIGBUS; } } else { filemap_invalidate_lock_shared(mapping); } result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); if ((result & VM_FAULT_ERROR) && error == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto retry; /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) result = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { filemap_invalidate_unlock_shared(mapping); } return result; } static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { return ext4_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .huge_fault = ext4_dax_huge_fault, .page_mkwrite = ext4_dax_fault, .pfn_mkwrite = ext4_dax_fault, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops #endif static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; static int ext4_file_mmap_prepare(struct vm_area_desc *desc) { int ret; struct file *file = desc->file; struct inode *inode = file->f_mapping->host; struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; if (file->f_mode & FMODE_WRITE) ret = ext4_emergency_state(inode->i_sb); else ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; if (unlikely(ret)) return ret; /* * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. */ if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev)) return -EOPNOTSUPP; file_accessed(file); if (IS_DAX(file_inode(file))) { desc->vm_ops = &ext4_dax_vm_ops; desc->vm_flags |= VM_HUGEPAGE; } else { desc->vm_ops = &ext4_file_vm_ops; } return 0; } static int ext4_sample_last_mounted(struct super_block *sb, struct vfsmount *mnt) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct path path; char buf[64], *cp; handle_t *handle; int err; if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) return 0; if (ext4_emergency_state(sb) || sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) return 0; ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); /* * Sample where the filesystem has been mounted and * store it in the superblock for sysadmin convenience * when trying to sort through large numbers of block * devices or filesystem images. */ memset(buf, 0, sizeof(buf)); path.mnt = mnt; path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); err = 0; if (IS_ERR(cp)) goto out; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); err = PTR_ERR(handle); if (IS_ERR(handle)) goto out; BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto out_journal; lock_buffer(sbi->s_sbh); strtomem_pad(sbi->s_es->s_last_mounted, cp, 0); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); out_journal: ext4_journal_stop(handle); out: sb_end_intwrite(sb); return err; } static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; if (filp->f_mode & FMODE_WRITE) ret = ext4_emergency_state(inode->i_sb); else ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; if (unlikely(ret)) return ret; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); if (ret) return ret; ret = fscrypt_file_open(inode, filp); if (ret) return ret; ret = fsverity_file_open(inode, filp); if (ret) return ret; /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present */ if (filp->f_mode & FMODE_WRITE) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) return ret; } if (ext4_inode_can_atomic_write(inode)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } /* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes * value for each. */ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t maxbytes = ext4_get_maxbytes(inode); switch (whence) { default: return generic_file_llseek_size(file, offset, whence, maxbytes, i_size_read(inode)); case SEEK_HOLE: inode_lock_shared(inode); offset = iomap_seek_hole(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; case SEEK_DATA: inode_lock_shared(inode); offset = iomap_seek_data(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; } if (offset < 0) return offset; return vfs_setpos(file, offset, maxbytes); } const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .iopoll = iocb_bio_iopoll, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .mmap_prepare = ext4_file_mmap_prepare, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, .get_unmapped_area = thp_get_unmapped_area, .splice_read = ext4_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | FOP_DIO_PARALLEL_WRITE | FOP_DONTCACHE, }; const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_file_getattr, .listxattr = ext4_listxattr, .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, .fileattr_set = ext4_fileattr_set, };
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 /* * net/tipc/ib_media.c: Infiniband bearer support for TIPC * * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> * * Based on eth_media.c, which carries the following copyright notice: * * Copyright (c) 2001-2007, Ericsson AB * Copyright (c) 2005-2008, 2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <linux/if_infiniband.h> #include "core.h" #include "bearer.h" #define TIPC_MAX_IB_LINK_WIN 500 /* convert InfiniBand address (media address format) media address to string */ static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) { if (str_size < 60) /* 60 = 19 * strlen("xx:") + strlen("xx\0") */ return 1; sprintf(str_buf, "%20phC", a->value); return 0; } /* Convert from media address format to discovery message addr format */ static int tipc_ib_addr2msg(char *msg, struct tipc_media_addr *addr) { memset(msg, 0, TIPC_MEDIA_INFO_SIZE); memcpy(msg, addr->value, INFINIBAND_ALEN); return 0; } /* Convert raw InfiniBand address format to media addr format */ static int tipc_ib_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, const char *msg) { memset(addr, 0, sizeof(*addr)); memcpy(addr->value, msg, INFINIBAND_ALEN); addr->media_id = TIPC_MEDIA_TYPE_IB; addr->broadcast = !memcmp(msg, b->bcast_addr.value, INFINIBAND_ALEN); return 0; } /* Convert discovery msg addr format to InfiniBand media addr format */ static int tipc_ib_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, char *msg) { return tipc_ib_raw2addr(b, addr, msg); } /* InfiniBand media registration info */ struct tipc_media ib_media_info = { .send_msg = tipc_l2_send_msg, .enable_media = tipc_enable_l2_media, .disable_media = tipc_disable_l2_media, .addr2str = tipc_ib_addr2str, .addr2msg = tipc_ib_addr2msg, .msg2addr = tipc_ib_msg2addr, .raw2addr = tipc_ib_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .min_win = TIPC_DEF_LINK_WIN, .max_win = TIPC_MAX_IB_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_IB, .hwaddr_len = INFINIBAND_ALEN, .name = "ib" };
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Public Key Signature Algorithm * * Copyright (c) 2023 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_SIG_H #define _CRYPTO_SIG_H #include <linux/crypto.h> /** * struct crypto_sig - user-instantiated objects which encapsulate * algorithms and core processing logic * * @base: Common crypto API algorithm data structure */ struct crypto_sig { struct crypto_tfm base; }; /** * struct sig_alg - generic public key signature algorithm * * @sign: Function performs a sign operation as defined by public key * algorithm. On success, the signature size is returned. * Optional. * @verify: Function performs a complete verify operation as defined by * public key algorithm, returning verification status. Optional. * @set_pub_key: Function invokes the algorithm specific set public key * function, which knows how to decode and interpret * the BER encoded public key and parameters. Mandatory. * @set_priv_key: Function invokes the algorithm specific set private key * function, which knows how to decode and interpret * the BER encoded private key and parameters. Optional. * @key_size: Function returns key size. Mandatory. * @digest_size: Function returns maximum digest size. Optional. * @max_size: Function returns maximum signature size. Optional. * @init: Initialize the cryptographic transformation object. * This function is used to initialize the cryptographic * transformation object. This function is called only once at * the instantiation time, right after the transformation context * was allocated. In case the cryptographic hardware has some * special requirements which need to be handled by software, this * function shall check for the precise requirement of the * transformation and put any software fallbacks in place. * @exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @init, used to remove various changes set in * @init. * * @base: Common crypto API algorithm data structure */ struct sig_alg { int (*sign)(struct crypto_sig *tfm, const void *src, unsigned int slen, void *dst, unsigned int dlen); int (*verify)(struct crypto_sig *tfm, const void *src, unsigned int slen, const void *digest, unsigned int dlen); int (*set_pub_key)(struct crypto_sig *tfm, const void *key, unsigned int keylen); int (*set_priv_key)(struct crypto_sig *tfm, const void *key, unsigned int keylen); unsigned int (*key_size)(struct crypto_sig *tfm); unsigned int (*digest_size)(struct crypto_sig *tfm); unsigned int (*max_size)(struct crypto_sig *tfm); int (*init)(struct crypto_sig *tfm); void (*exit)(struct crypto_sig *tfm); struct crypto_alg base; }; /** * DOC: Generic Public Key Signature API * * The Public Key Signature API is used with the algorithms of type * CRYPTO_ALG_TYPE_SIG (listed as type "sig" in /proc/crypto) */ /** * crypto_alloc_sig() - allocate signature tfm handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * signing algorithm e.g. "ecdsa" * @type: specifies the type of the algorithm * @mask: specifies the mask for the algorithm * * Allocate a handle for public key signature algorithm. The returned struct * crypto_sig is the handle that is required for any subsequent * API invocation for signature operations. * * Return: allocated handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_sig *crypto_alloc_sig(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_sig_tfm(struct crypto_sig *tfm) { return &tfm->base; } static inline struct crypto_sig *__crypto_sig_tfm(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_sig, base); } static inline struct sig_alg *__crypto_sig_alg(struct crypto_alg *alg) { return container_of(alg, struct sig_alg, base); } static inline struct sig_alg *crypto_sig_alg(struct crypto_sig *tfm) { return __crypto_sig_alg(crypto_sig_tfm(tfm)->__crt_alg); } /** * crypto_free_sig() - free signature tfm handle * * @tfm: signature tfm handle allocated with crypto_alloc_sig() * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_sig(struct crypto_sig *tfm) { crypto_destroy_tfm(tfm, crypto_sig_tfm(tfm)); } /** * crypto_sig_keysize() - Get key size * * Function returns the key size in bits. * Function assumes that the key is already set in the transformation. If this * function is called without a setkey or with a failed setkey, you may end up * in a NULL dereference. * * @tfm: signature tfm handle allocated with crypto_alloc_sig() */ static inline unsigned int crypto_sig_keysize(struct crypto_sig *tfm) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->key_size(tfm); } /** * crypto_sig_digestsize() - Get maximum digest size * * Function returns the maximum digest size in bytes. * Function assumes that the key is already set in the transformation. If this * function is called without a setkey or with a failed setkey, you may end up * in a NULL dereference. * * @tfm: signature tfm handle allocated with crypto_alloc_sig() */ static inline unsigned int crypto_sig_digestsize(struct crypto_sig *tfm) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->digest_size(tfm); } /** * crypto_sig_maxsize() - Get maximum signature size * * Function returns the maximum signature size in bytes. * Function assumes that the key is already set in the transformation. If this * function is called without a setkey or with a failed setkey, you may end up * in a NULL dereference. * * @tfm: signature tfm handle allocated with crypto_alloc_sig() */ static inline unsigned int crypto_sig_maxsize(struct crypto_sig *tfm) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->max_size(tfm); } /** * crypto_sig_sign() - Invoke signing operation * * Function invokes the specific signing operation for a given algorithm * * @tfm: signature tfm handle allocated with crypto_alloc_sig() * @src: source buffer * @slen: source length * @dst: destination obuffer * @dlen: destination length * * Return: signature size on success; error code in case of error */ static inline int crypto_sig_sign(struct crypto_sig *tfm, const void *src, unsigned int slen, void *dst, unsigned int dlen) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->sign(tfm, src, slen, dst, dlen); } /** * crypto_sig_verify() - Invoke signature verification * * Function invokes the specific signature verification operation * for a given algorithm. * * @tfm: signature tfm handle allocated with crypto_alloc_sig() * @src: source buffer * @slen: source length * @digest: digest * @dlen: digest length * * Return: zero on verification success; error code in case of error. */ static inline int crypto_sig_verify(struct crypto_sig *tfm, const void *src, unsigned int slen, const void *digest, unsigned int dlen) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->verify(tfm, src, slen, digest, dlen); } /** * crypto_sig_set_pubkey() - Invoke set public key operation * * Function invokes the algorithm specific set key function, which knows * how to decode and interpret the encoded key and parameters * * @tfm: tfm handle * @key: BER encoded public key, algo OID, paramlen, BER encoded * parameters * @keylen: length of the key (not including other data) * * Return: zero on success; error code in case of error */ static inline int crypto_sig_set_pubkey(struct crypto_sig *tfm, const void *key, unsigned int keylen) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->set_pub_key(tfm, key, keylen); } /** * crypto_sig_set_privkey() - Invoke set private key operation * * Function invokes the algorithm specific set key function, which knows * how to decode and interpret the encoded key and parameters * * @tfm: tfm handle * @key: BER encoded private key, algo OID, paramlen, BER encoded * parameters * @keylen: length of the key (not including other data) * * Return: zero on success; error code in case of error */ static inline int crypto_sig_set_privkey(struct crypto_sig *tfm, const void *key, unsigned int keylen) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->set_priv_key(tfm, key, keylen); } #endif
34 34 34 2 32 34 34 32 33 2 34 2 3 3 3 3 1 2 107 103 4 4 4 4 4 4 4 1 3 4 4 4 226 1 226 227 64 162 19 19 19 20 1 19 19 2 2 2 191 2 189 186 146 5 18 25 5 76 76 171 170 14 14 13 1 1 1 6 6 6 6 6 6 112 112 7 1 6 40 40 7 40 40 32 32 106 8 14 26 12 9 19 9 17 17 5 5 5 5 5 5 5 32 30 8 18 32 32 6 33 33 9 1 9 19 27 19 6 31 8 3 1 2 6 32 33 33 33 33 33 7 7 7 7 7 106 21 26 31 37 37 33 33 2 33 2 33 33 33 33 35 35 33 32 30 2 32 34 34 34 2 4 34 34 34 34 34 77 77 77 16 60 2 32 34 34 34 77 187 188 149 188 159 29 188 187 1 37 88 108 179 7 112 112 112 94 105 67 108 77 68 68 62 4 1 3 65 64 4 68 87 19 24 81 81 23 1 104 1 111 112 112 106 68 111 111 1 53 53 52 40 16 16 16 16 20 20 8 8 76 76 179 76 1 1 75 83 7 7 3 1 7 7 7 7 7 7 1 7 1 3 4 7 185 186 4 3 1 7 4 4 7 7 7 155 156 157 76 75 1 76 76 75 76 74 76 75 76 2 2 2 2 3 3 1 4 4 1 4 4 4 2 2 7 7 7 6 5 5 2 4 1 1 2 14 2 1 11 140 75 12 52 13 55 3 15 8 45 6 2 48 1 49 48 41 3 8 46 6 104 100 3 19 78 100 5 11 106 106 106 106 103 95 11 106 106 105 3 3 3 3 3 105 106 55 73 73 55 13 97 97 100 100 97 94 2 97 53 95 2 76 76 76 55 64 63 14 14 13 1 14 14 2 2 2 2 2 2 2 2 2 2 2 2 2 252 252 26 79 79 5 1 10 10 178 179 179 179 179 179 179 179 179 179 178 179 178 179 179 179 76 76 76 76 75 76 35 76 76 35 285 281 42 275 33 1 17 177 34 34 33 34 33 34 34 34 33 33 33 33 77 44 34 1 33 33 34 1 33 34 34 34 34 34 34 75 2 77 188 187 186 185 184 28 156 184 175 8 8 183 183 182 182 182 181 180 180 180 179 179 179 179 177 177 177 177 28 129 20 169 8 1 177 176 1 176 2 1 27 129 20 168 8 169 8 8 8 4 1 172 172 172 172 172 172 172 172 170 169 161 8 8 8 8 170 169 168 172 172 171 172 162 8 8 170 170 168 168 168 168 167 157 157 256 256 2 172 165 165 165 157 8 8 8 8 165 1 164 165 164 165 165 190 190 24 165 189 1 188 188 182 3 5 4 3 163 17 21 10 28 7 31 2 1 35 39 1 38 38 38 38 38 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 // SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/segment.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/sched/mm.h> #include <linux/prefetch.h> #include <linux/kthread.h> #include <linux/swap.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/sched/signal.h> #include <linux/random.h> #include "f2fs.h" #include "segment.h" #include "node.h" #include "gc.h" #include "iostat.h" #include <trace/events/f2fs.h> #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *revoke_entry_slab; static unsigned long __reverse_ulong(unsigned char *str) { unsigned long tmp = 0; int shift = 24, idx = 0; #if BITS_PER_LONG == 64 shift = 56; #endif while (shift >= 0) { tmp |= (unsigned long)str[idx++] << shift; shift -= BITS_PER_BYTE; } return tmp; } /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since * MSB and LSB are reversed in a byte by f2fs_set_bit. */ static inline unsigned long __reverse_ffs(unsigned long word) { int num = 0; #if BITS_PER_LONG == 64 if ((word & 0xffffffff00000000UL) == 0) num += 32; else word >>= 32; #endif if ((word & 0xffff0000) == 0) num += 16; else word >>= 16; if ((word & 0xff00) == 0) num += 8; else word >>= 8; if ((word & 0xf0) == 0) num += 4; else word >>= 4; if ((word & 0xc) == 0) num += 2; else word >>= 2; if ((word & 0x2) == 0) num += 1; return num; } /* * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. * @size must be integral times of unsigned long. * Example: * MSB <--> LSB * f2fs_set_bit(0, bitmap) => 1000 0000 * f2fs_set_bit(7, bitmap) => 0000 0001 */ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = size; unsigned long tmp; if (offset >= size) return size; size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; while (1) { if (*p == 0) goto pass; tmp = __reverse_ulong((unsigned char *)p); tmp &= ~0UL >> offset; if (size < BITS_PER_LONG) tmp &= (~0UL << (BITS_PER_LONG - size)); if (tmp) goto found; pass: if (size <= BITS_PER_LONG) break; size -= BITS_PER_LONG; offset = 0; p++; } return result; found: return result - size + __reverse_ffs(tmp); } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = size; unsigned long tmp; if (offset >= size) return size; size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; while (1) { if (*p == ~0UL) goto pass; tmp = __reverse_ulong((unsigned char *)p); if (offset) tmp |= ~0UL << (BITS_PER_LONG - offset); if (size < BITS_PER_LONG) tmp |= ~0UL >> size; if (tmp != ~0UL) goto found; pass: if (size <= BITS_PER_LONG) break; size -= BITS_PER_LONG; offset = 0; p++; } return result; found: return result - size + __reverse_ffz(tmp); } bool f2fs_need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (f2fs_lfs_mode(sbi)) return false; if (sbi->gc_mode == GC_URGENT_HIGH) return true; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } void f2fs_abort_atomic_write(struct inode *inode, bool clean) { struct f2fs_inode_info *fi = F2FS_I(inode); if (!f2fs_is_atomic_file(inode)) return; if (clean) truncate_inode_pages_final(inode->i_mapping); release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_COMMITTED); clear_inode_flag(inode, FI_ATOMIC_REPLACE); clear_inode_flag(inode, FI_ATOMIC_FILE); if (is_inode_flag_set(inode, FI_ATOMIC_DIRTIED)) { clear_inode_flag(inode, FI_ATOMIC_DIRTIED); /* * The vfs inode keeps clean during commit, but the f2fs inode * doesn't. So clear the dirty state after commit and let * f2fs_mark_inode_dirty_sync ensure a consistent dirty state. */ f2fs_inode_synced(inode); f2fs_mark_inode_dirty_sync(inode, true); } stat_dec_atomic_inode(inode); F2FS_I(inode)->atomic_write_task = NULL; if (clean) { f2fs_i_size_write(inode, fi->original_i_size); fi->original_i_size = 0; } /* avoid stale dirty inode during eviction */ sync_inode_metadata(inode, 0); } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, block_t new_addr, block_t *old_addr, bool recover) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; struct node_info ni; int err; retry: set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (err) { if (err == -ENOMEM) { memalloc_retry_wait(GFP_NOFS); goto retry; } return err; } err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) { f2fs_put_dnode(&dn); return err; } if (recover) { /* dn.data_blkaddr is always valid */ if (!__is_valid_data_blkaddr(new_addr)) { if (new_addr == NULL_ADDR) dec_valid_block_count(sbi, inode, 1); f2fs_invalidate_blocks(sbi, dn.data_blkaddr, 1); f2fs_update_data_blkaddr(&dn, new_addr); } else { f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, ni.version, true, true); } } else { blkcnt_t count = 1; err = inc_valid_block_count(sbi, inode, &count, true); if (err) { f2fs_put_dnode(&dn); return err; } *old_addr = dn.data_blkaddr; f2fs_truncate_data_blocks_range(&dn, 1); dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, ni.version, true, false); } f2fs_put_dnode(&dn); trace_f2fs_replace_atomic_write_block(inode, F2FS_I(inode)->cow_inode, index, old_addr ? *old_addr : 0, new_addr, recover); return 0; } static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; pgoff_t start_index = 0; bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { if (revoke) { __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); } else if (truncate) { f2fs_truncate_hole(inode, start_index, cur->index); start_index = cur->index + 1; } list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } if (!revoke && truncate) f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); } static int __f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct inode *cow_inode = fi->cow_inode; struct revoke_entry *new; struct list_head revoke_list; block_t blkaddr; struct dnode_of_data dn; pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); pgoff_t off = 0, blen, index; int ret = 0, i; INIT_LIST_HEAD(&revoke_list); while (len) { blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len); set_new_dnode(&dn, cow_inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { goto out; } else if (ret == -ENOENT) { ret = 0; if (dn.max_level == 0) goto out; goto next; } blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, cow_inode), len); index = off; for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { blkaddr = f2fs_data_blkaddr(&dn); if (!__is_valid_data_blkaddr(blkaddr)) { continue; } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); ret = -EFSCORRUPTED; goto out; } new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, true, NULL); ret = __replace_atomic_write_block(inode, index, blkaddr, &new->old_addr, false); if (ret) { f2fs_put_dnode(&dn); kmem_cache_free(revoke_entry_slab, new); goto out; } f2fs_update_data_blkaddr(&dn, NULL_ADDR); new->index = index; list_add_tail(&new->list, &revoke_list); } f2fs_put_dnode(&dn); next: off += blen; len -= blen; } out: if (time_to_inject(sbi, FAULT_TIMEOUT)) f2fs_io_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT); if (ret) { sbi->revoked_atomic_block += fi->atomic_write_cnt; } else { sbi->committed_atomic_block += fi->atomic_write_cnt; set_inode_flag(inode, FI_ATOMIC_COMMITTED); /* * inode may has no FI_ATOMIC_DIRTIED flag due to no write * before commit. */ if (is_inode_flag_set(inode, FI_ATOMIC_DIRTIED)) { /* clear atomic dirty status and set vfs dirty status */ clear_inode_flag(inode, FI_ATOMIC_DIRTIED); f2fs_mark_inode_dirty_sync(inode, true); } } __complete_revoke_list(inode, &revoke_list, ret ? true : false); return ret; } int f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); int err; err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (err) return err; f2fs_down_write(&fi->i_gc_rwsem[WRITE]); f2fs_lock_op(sbi); err = __f2fs_commit_atomic_write(inode); f2fs_unlock_op(sbi); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); return err; } /* * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (f2fs_cp_error(sbi)) return; if (time_to_inject(sbi, FAULT_CHECKPOINT)) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); /* balance_fs_bg is able to be pending */ if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi, false); if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return; /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. */ if (has_enough_free_secs(sbi, 0, 0)) return; if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && sbi->gc_thread->f2fs_gc_task) { DEFINE_WAIT(wait); prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, TASK_UNINTERRUPTIBLE); wake_up(&sbi->gc_thread->gc_wait_queue_head); io_schedule(); finish_wait(&sbi->gc_thread->fggc_wq, &wait); } else { struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, .init_gc_type = f2fs_sb_has_blkzoned(sbi) ? FG_GC : BG_GC, .no_bg_gc = true, .should_migrate_blocks = false, .err_gc_skipped = false, .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); f2fs_gc(sbi, &gc_control); } } static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi) { int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2; unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS); unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA); unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES); unsigned int meta = get_pages(sbi, F2FS_DIRTY_META); unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int threshold = SEGS_TO_BLKS(sbi, (factor * DEFAULT_DIRTY_THRESHOLD)); unsigned int global_threshold = threshold * 3 / 2; if (dents >= threshold || qdata >= threshold || nodes >= threshold || meta >= threshold || imeta >= threshold) return true; return dents + qdata + nodes + meta + imeta > global_threshold; } void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) { if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return; /* try to shrink extent cache when there is no enough memory */ if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) f2fs_shrink_read_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); /* try to shrink age extent cache when there is no enough memory */ if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE)) f2fs_shrink_age_extent_tree(sbi, AGE_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); if (!f2fs_available_free_memory(sbi, FREE_NIDS)) f2fs_try_to_free_nids(sbi, MAX_FREE_NIDS); else f2fs_build_free_nids(sbi, false, false); if (excess_dirty_nats(sbi) || excess_dirty_threshold(sbi) || excess_prefree_segs(sbi) || !f2fs_space_for_roll_forward(sbi)) goto do_sync; /* there is background inflight IO or foreground operation recently */ if (is_inflight_io(sbi, REQ_TIME) || (!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem))) return; /* exceed periodical checkpoint timeout threshold */ if (f2fs_time_over(sbi, CP_TIME)) goto do_sync; /* checkpoint is the only way to shrink partial cached entries */ if (f2fs_available_free_memory(sbi, NAT_ENTRIES) && f2fs_available_free_memory(sbi, INO_ENTRIES)) return; do_sync: if (test_opt(sbi, DATA_FLUSH) && from_bg) { struct blk_plug plug; mutex_lock(&sbi->flush_lock); blk_start_plug(&plug); f2fs_sync_dirty_inodes(sbi, FILE_INODE, false); blk_finish_plug(&plug); mutex_unlock(&sbi->flush_lock); } stat_inc_cp_call_count(sbi, BACKGROUND); f2fs_sync_fs(sbi->sb, 1); } static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { int ret = blkdev_issue_flush(bdev); trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), test_opt(sbi, FLUSH_MERGE), ret); if (!ret) f2fs_update_iostat(sbi, NULL, FS_FLUSH_IO, 0); return ret; } static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) { int ret = 0; int i; if (!f2fs_is_multi_device(sbi)) return __submit_flush_wait(sbi, sbi->sb->s_bdev); for (i = 0; i < sbi->s_ndevs; i++) { if (!f2fs_is_dirty_device(sbi, ino, i, FLUSH_INO)) continue; ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) break; } return ret; } static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; wait_queue_head_t *q = &fcc->flush_wait_queue; repeat: if (kthread_should_stop()) return 0; if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode); ret = submit_flush_wait(sbi, cmd->ino); atomic_inc(&fcc->issued_flush); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; complete(&cmd->wait); } fcc->dispatch_list = NULL; } wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; } int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; int ret; if (test_opt(sbi, NOBARRIER)) return 0; if (!test_opt(sbi, FLUSH_MERGE)) { atomic_inc(&fcc->queued_flush); ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->queued_flush); atomic_inc(&fcc->issued_flush); return ret; } if (atomic_inc_return(&fcc->queued_flush) == 1 || f2fs_is_multi_device(sbi)) { ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->queued_flush); atomic_inc(&fcc->issued_flush); return ret; } cmd.ino = ino; init_completion(&cmd.wait); llist_add(&cmd.llnode, &fcc->issue_list); /* * update issue_list before we wake up issue_flush thread, this * smp_mb() pairs with another barrier in ___wait_event(), see * more details in comments of waitqueue_active(). */ smp_mb(); if (waitqueue_active(&fcc->flush_wait_queue)) wake_up(&fcc->flush_wait_queue); if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); atomic_dec(&fcc->queued_flush); } else { struct llist_node *list; list = llist_del_all(&fcc->issue_list); if (!list) { wait_for_completion(&cmd.wait); atomic_dec(&fcc->queued_flush); } else { struct flush_cmd *tmp, *next; ret = submit_flush_wait(sbi, ino); llist_for_each_entry_safe(tmp, next, list, llnode) { if (tmp == &cmd) { cmd.ret = ret; atomic_dec(&fcc->queued_flush); continue; } tmp->ret = ret; complete(&tmp->wait); } } } return cmd.ret; } int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; if (fcc->f2fs_issue_flush) return 0; goto init_thread; } fcc = f2fs_kzalloc(sbi, sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; atomic_set(&fcc->issued_flush, 0); atomic_set(&fcc->queued_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; if (!test_opt(sbi, FLUSH_MERGE)) return 0; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { int err = PTR_ERR(fcc->f2fs_issue_flush); fcc->f2fs_issue_flush = NULL; return err; } return 0; } void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; if (fcc && fcc->f2fs_issue_flush) { struct task_struct *flush_thread = fcc->f2fs_issue_flush; fcc->f2fs_issue_flush = NULL; kthread_stop(flush_thread); } if (free) { kfree(fcc); SM_I(sbi)->fcc_info = NULL; } } int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) { int ret = 0, i; if (!f2fs_is_multi_device(sbi)) return 0; if (test_opt(sbi, NOBARRIER)) return 0; for (i = 1; i < sbi->s_ndevs; i++) { int count = DEFAULT_RETRY_IO_COUNT; if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) continue; do { ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); } while (ret && --count); if (ret) { f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FLUSH_FAIL); break; } spin_lock(&sbi->dev_lock); f2fs_clear_bit(i, (char *)&sbi->dirty_device); spin_unlock(&sbi->dev_lock); } return ret; } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); /* need not be added */ if (is_curseg(sbi, segno)) return; if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]++; if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); enum dirty_type t = sentry->type; if (unlikely(t >= DIRTY)) { f2fs_bug_on(sbi, 1); return; } if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]++; if (__is_large_section(sbi)) { unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); block_t valid_blocks = get_valid_blocks(sbi, segno, true); f2fs_bug_on(sbi, (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && !valid_blocks) || valid_blocks == CAP_BLKS_PER_SEC(sbi)); if (!is_cursec(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); } } } static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); block_t valid_blocks; if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]--; if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); enum dirty_type t = sentry->type; if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; valid_blocks = get_valid_blocks(sbi, segno, true); if (valid_blocks == 0) { clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); #ifdef CONFIG_F2FS_CHECK_FS clear_bit(segno, SIT_I(sbi)->invalid_segmap); #endif } if (__is_large_section(sbi)) { unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) { clear_bit(secno, dirty_i->dirty_secmap); return; } if (!is_cursec(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); } } } /* * Should not occur error such as -ENOMEM. * Adding dirty entry into seglist is not critical operation. * If a given segment is one of current working segments, it won't be added. */ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned short valid_blocks, ckpt_valid_blocks; unsigned int usable_blocks; if (segno == NULL_SEGNO || is_curseg(sbi, segno)) return; usable_blocks = f2fs_usable_blks_in_seg(sbi, segno); mutex_lock(&dirty_i->seglist_lock); valid_blocks = get_valid_blocks(sbi, segno, false); ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false); if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || ckpt_valid_blocks == usable_blocks)) { __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); } else if (valid_blocks < usable_blocks) { __locate_dirty_segment(sbi, segno, DIRTY); } else { /* Recovery routine with SSR needs this */ __remove_dirty_segment(sbi, segno, DIRTY); } mutex_unlock(&dirty_i->seglist_lock); } /* This moves currently empty dirty blocks to prefree. Must hold seglist_lock */ void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int segno; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; if (is_curseg(sbi, segno)) continue; __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); } mutex_unlock(&dirty_i->seglist_lock); } block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) { int ovp_hole_segs = (overprovision_segments(sbi) - reserved_segments(sbi)); block_t ovp_holes = SEGS_TO_BLKS(sbi, ovp_hole_segs); struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); block_t holes[2] = {0, 0}; /* DATA and NODE */ block_t unusable; struct seg_entry *se; unsigned int segno; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { se = get_seg_entry(sbi, segno); if (IS_NODESEG(se->type)) holes[NODE] += f2fs_usable_blks_in_seg(sbi, segno) - se->valid_blocks; else holes[DATA] += f2fs_usable_blks_in_seg(sbi, segno) - se->valid_blocks; } mutex_unlock(&dirty_i->seglist_lock); unusable = max(holes[DATA], holes[NODE]); if (unusable > ovp_holes) return unusable - ovp_holes; return 0; } int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable) { int ovp_hole_segs = (overprovision_segments(sbi) - reserved_segments(sbi)); if (F2FS_OPTION(sbi).unusable_cap_perc == 100) return 0; if (unusable > F2FS_OPTION(sbi).unusable_cap) return -EAGAIN; if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) && dirty_segments(sbi) > ovp_hole_segs) return -EAGAIN; if (has_not_enough_free_secs(sbi, 0, 0)) return -EAGAIN; return 0; } /* This is only used by SBI_CP_DISABLED */ static unsigned int get_free_segment(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int segno = 0; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; if (get_ckpt_valid_blocks(sbi, segno, false)) continue; mutex_unlock(&dirty_i->seglist_lock); return segno; } mutex_unlock(&dirty_i->seglist_lock); return NULL_SEGNO; } static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc; f2fs_bug_on(sbi, !len); pend_list = &dcc->pend_list[plist_idx(len)]; dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL); INIT_LIST_HEAD(&dc->list); dc->bdev = bdev; dc->di.lstart = lstart; dc->di.start = start; dc->di.len = len; dc->ref = 0; dc->state = D_PREP; dc->queued = 0; dc->error = 0; init_completion(&dc->wait); list_add_tail(&dc->list, pend_list); spin_lock_init(&dc->lock); dc->bio_ref = 0; atomic_inc(&dcc->discard_cmd_cnt); dcc->undiscard_blks += len; return dc; } static bool f2fs_check_discard_tree(struct f2fs_sb_info *sbi) { #ifdef CONFIG_F2FS_CHECK_FS struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct rb_node *cur = rb_first_cached(&dcc->root), *next; struct discard_cmd *cur_dc, *next_dc; while (cur) { next = rb_next(cur); if (!next) return true; cur_dc = rb_entry(cur, struct discard_cmd, rb_node); next_dc = rb_entry(next, struct discard_cmd, rb_node); if (cur_dc->di.lstart + cur_dc->di.len > next_dc->di.lstart) { f2fs_info(sbi, "broken discard_rbtree, " "cur(%u, %u) next(%u, %u)", cur_dc->di.lstart, cur_dc->di.len, next_dc->di.lstart, next_dc->di.len); return false; } cur = next; } #endif return true; } static struct discard_cmd *__lookup_discard_cmd(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct rb_node *node = dcc->root.rb_root.rb_node; struct discard_cmd *dc; while (node) { dc = rb_entry(node, struct discard_cmd, rb_node); if (blkaddr < dc->di.lstart) node = node->rb_left; else if (blkaddr >= dc->di.lstart + dc->di.len) node = node->rb_right; else return dc; } return NULL; } static struct discard_cmd *__lookup_discard_cmd_ret(struct rb_root_cached *root, block_t blkaddr, struct discard_cmd **prev_entry, struct discard_cmd **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent) { struct rb_node **pnode = &root->rb_root.rb_node; struct rb_node *parent = NULL, *tmp_node; struct discard_cmd *dc; *insert_p = NULL; *insert_parent = NULL; *prev_entry = NULL; *next_entry = NULL; if (RB_EMPTY_ROOT(&root->rb_root)) return NULL; while (*pnode) { parent = *pnode; dc = rb_entry(*pnode, struct discard_cmd, rb_node); if (blkaddr < dc->di.lstart) pnode = &(*pnode)->rb_left; else if (blkaddr >= dc->di.lstart + dc->di.len) pnode = &(*pnode)->rb_right; else goto lookup_neighbors; } *insert_p = pnode; *insert_parent = parent; dc = rb_entry(parent, struct discard_cmd, rb_node); tmp_node = parent; if (parent && blkaddr > dc->di.lstart) tmp_node = rb_next(parent); *next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); tmp_node = parent; if (parent && blkaddr < dc->di.lstart) tmp_node = rb_prev(parent); *prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); return NULL; lookup_neighbors: /* lookup prev node for merging backward later */ tmp_node = rb_prev(&dc->rb_node); *prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); /* lookup next node for merging frontward later */ tmp_node = rb_next(&dc->rb_node); *next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); return dc; } static void __detach_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { if (dc->state == D_DONE) atomic_sub(dc->queued, &dcc->queued_discard); list_del(&dc->list); rb_erase_cached(&dc->rb_node, &dcc->root); dcc->undiscard_blks -= dc->di.len; kmem_cache_free(discard_cmd_slab, dc); atomic_dec(&dcc->discard_cmd_cnt); } static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; unsigned long flags; trace_f2fs_remove_discard(dc->bdev, dc->di.start, dc->di.len); spin_lock_irqsave(&dc->lock, flags); if (dc->bio_ref) { spin_unlock_irqrestore(&dc->lock, flags); return; } spin_unlock_irqrestore(&dc->lock, flags); f2fs_bug_on(sbi, dc->ref); if (dc->error == -EOPNOTSUPP) dc->error = 0; if (dc->error) f2fs_info_ratelimited(sbi, "Issue discard(%u, %u, %u) failed, ret: %d", dc->di.lstart, dc->di.start, dc->di.len, dc->error); __detach_discard_cmd(dcc, dc); } static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; unsigned long flags; spin_lock_irqsave(&dc->lock, flags); if (!dc->error) dc->error = blk_status_to_errno(bio->bi_status); dc->bio_ref--; if (!dc->bio_ref && dc->state == D_SUBMIT) { dc->state = D_DONE; complete_all(&dc->wait); } spin_unlock_irqrestore(&dc->lock, flags); bio_put(bio); } static void __check_sit_bitmap(struct f2fs_sb_info *sbi, block_t start, block_t end) { #ifdef CONFIG_F2FS_CHECK_FS struct seg_entry *sentry; unsigned int segno; block_t blk = start; unsigned long offset, size, *map; while (blk < end) { segno = GET_SEGNO(sbi, blk); sentry = get_seg_entry(sbi, segno); offset = GET_BLKOFF_FROM_SEG0(sbi, blk); if (end < START_BLOCK(sbi, segno + 1)) size = GET_BLKOFF_FROM_SEG0(sbi, end); else size = BLKS_PER_SEG(sbi); map = (unsigned long *)(sentry->cur_valid_map); offset = __find_rev_next_bit(map, size, offset); f2fs_bug_on(sbi, offset != size); blk = START_BLOCK(sbi, segno + 1); } #endif } static void __init_discard_policy(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, int discard_type, unsigned int granularity) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; /* common policy */ dpolicy->type = discard_type; dpolicy->sync = true; dpolicy->ordered = false; dpolicy->granularity = granularity; dpolicy->max_requests = dcc->max_discard_request; dpolicy->io_aware_gran = dcc->discard_io_aware_gran; dpolicy->timeout = false; if (discard_type == DPOLICY_BG) { dpolicy->min_interval = dcc->min_discard_issue_time; dpolicy->mid_interval = dcc->mid_discard_issue_time; dpolicy->max_interval = dcc->max_discard_issue_time; if (dcc->discard_io_aware == DPOLICY_IO_AWARE_ENABLE) dpolicy->io_aware = true; else if (dcc->discard_io_aware == DPOLICY_IO_AWARE_DISABLE) dpolicy->io_aware = false; dpolicy->sync = false; dpolicy->ordered = true; if (utilization(sbi) > dcc->discard_urgent_util) { dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = dcc->min_discard_issue_time; } } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = dcc->min_discard_issue_time; dpolicy->mid_interval = dcc->mid_discard_issue_time; dpolicy->max_interval = dcc->max_discard_issue_time; dpolicy->io_aware = false; } else if (discard_type == DPOLICY_FSTRIM) { dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ dpolicy->granularity = MIN_DISCARD_GRANULARITY; dpolicy->timeout = true; } } static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len); #ifdef CONFIG_BLK_DEV_ZONED static void __submit_zone_reset_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc, blk_opf_t flag, struct list_head *wait_list, unsigned int *issued) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct block_device *bdev = dc->bdev; struct bio *bio = bio_alloc(bdev, 0, REQ_OP_ZONE_RESET | flag, GFP_NOFS); unsigned long flags; trace_f2fs_issue_reset_zone(bdev, dc->di.start); spin_lock_irqsave(&dc->lock, flags); dc->state = D_SUBMIT; dc->bio_ref++; spin_unlock_irqrestore(&dc->lock, flags); if (issued) (*issued)++; atomic_inc(&dcc->queued_discard); dc->queued++; list_move_tail(&dc->list, wait_list); /* sanity check on discard range */ __check_sit_bitmap(sbi, dc->di.lstart, dc->di.lstart + dc->di.len); bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(dc->di.start); bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(bio); atomic_inc(&dcc->issued_discard); f2fs_update_iostat(sbi, NULL, FS_ZONE_RESET_IO, dc->di.len * F2FS_BLKSIZE); } #endif /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, struct discard_cmd *dc, int *issued) { struct block_device *bdev = dc->bdev; unsigned int max_discard_blocks = SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0; block_t lstart, start, len, total_len; int err = 0; if (dc->state != D_PREP) return 0; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) return 0; #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) { int devi = f2fs_bdev_index(sbi, bdev); if (devi < 0) return -EINVAL; if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) { __submit_zone_reset_cmd(sbi, dc, flag, wait_list, issued); return 0; } } #endif /* * stop issuing discard for any of below cases: * 1. device is conventional zone, but it doesn't support discard. * 2. device is regulare device, after snapshot it doesn't support * discard. */ if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; trace_f2fs_issue_discard(bdev, dc->di.start, dc->di.len); lstart = dc->di.lstart; start = dc->di.start; len = dc->di.len; total_len = len; dc->di.len = 0; while (total_len && *issued < dpolicy->max_requests && !err) { struct bio *bio = NULL; unsigned long flags; bool last = true; if (len > max_discard_blocks) { len = max_discard_blocks; last = false; } (*issued)++; if (*issued == dpolicy->max_requests) last = true; dc->di.len += len; err = 0; if (time_to_inject(sbi, FAULT_DISCARD)) { err = -EIO; spin_lock_irqsave(&dc->lock, flags); if (dc->state == D_PARTIAL) dc->state = D_SUBMIT; spin_unlock_irqrestore(&dc->lock, flags); break; } __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio); f2fs_bug_on(sbi, !bio); /* * should keep before submission to avoid D_DONE * right away */ spin_lock_irqsave(&dc->lock, flags); if (last) dc->state = D_SUBMIT; else dc->state = D_PARTIAL; dc->bio_ref++; spin_unlock_irqrestore(&dc->lock, flags); atomic_inc(&dcc->queued_discard); dc->queued++; list_move_tail(&dc->list, wait_list); /* sanity check on discard range */ __check_sit_bitmap(sbi, lstart, lstart + len); bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; bio->bi_opf |= flag; submit_bio(bio); atomic_inc(&dcc->issued_discard); f2fs_update_iostat(sbi, NULL, FS_DISCARD_IO, len * F2FS_BLKSIZE); lstart += len; start += len; total_len -= len; len = total_len; } if (!err && len) { dcc->undiscard_blks -= len; __update_discard_tree_range(sbi, bdev, lstart, start, len); } return err; } static void __insert_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct rb_node **p = &dcc->root.rb_root.rb_node; struct rb_node *parent = NULL; struct discard_cmd *dc; bool leftmost = true; /* look up rb tree to find parent node */ while (*p) { parent = *p; dc = rb_entry(parent, struct discard_cmd, rb_node); if (lstart < dc->di.lstart) { p = &(*p)->rb_left; } else if (lstart >= dc->di.lstart + dc->di.len) { p = &(*p)->rb_right; leftmost = false; } else { /* Let's skip to add, if exists */ return; } } dc = __create_discard_cmd(sbi, bdev, lstart, start, len); rb_link_node(&dc->rb_node, parent, p); rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost); } static void __relocate_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->di.len)]); } static void __punch_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_info di = dc->di; bool modified = false; if (dc->state == D_DONE || dc->di.len == 1) { __remove_discard_cmd(sbi, dc); return; } dcc->undiscard_blks -= di.len; if (blkaddr > di.lstart) { dc->di.len = blkaddr - dc->di.lstart; dcc->undiscard_blks += dc->di.len; __relocate_discard_cmd(dcc, dc); modified = true; } if (blkaddr < di.lstart + di.len - 1) { if (modified) { __insert_discard_cmd(sbi, dc->bdev, blkaddr + 1, di.start + blkaddr + 1 - di.lstart, di.lstart + di.len - 1 - blkaddr); } else { dc->di.lstart++; dc->di.len--; dc->di.start++; dcc->undiscard_blks += dc->di.len; __relocate_discard_cmd(dcc, dc); } } } static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct discard_cmd *dc; struct discard_info di = {0}; struct rb_node **insert_p = NULL, *insert_parent = NULL; unsigned int max_discard_blocks = SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); block_t end = lstart + len; dc = __lookup_discard_cmd_ret(&dcc->root, lstart, &prev_dc, &next_dc, &insert_p, &insert_parent); if (dc) prev_dc = dc; if (!prev_dc) { di.lstart = lstart; di.len = next_dc ? next_dc->di.lstart - lstart : len; di.len = min(di.len, len); di.start = start; } while (1) { struct rb_node *node; bool merged = false; struct discard_cmd *tdc = NULL; if (prev_dc) { di.lstart = prev_dc->di.lstart + prev_dc->di.len; if (di.lstart < lstart) di.lstart = lstart; if (di.lstart >= end) break; if (!next_dc || next_dc->di.lstart > end) di.len = end - di.lstart; else di.len = next_dc->di.lstart - di.lstart; di.start = start + di.lstart - lstart; } if (!di.len) goto next; if (prev_dc && prev_dc->state == D_PREP && prev_dc->bdev == bdev && __is_discard_back_mergeable(&di, &prev_dc->di, max_discard_blocks)) { prev_dc->di.len += di.len; dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, prev_dc); di = prev_dc->di; tdc = prev_dc; merged = true; } if (next_dc && next_dc->state == D_PREP && next_dc->bdev == bdev && __is_discard_front_mergeable(&di, &next_dc->di, max_discard_blocks)) { next_dc->di.lstart = di.lstart; next_dc->di.len += di.len; next_dc->di.start = di.start; dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, next_dc); if (tdc) __remove_discard_cmd(sbi, tdc); merged = true; } if (!merged) __insert_discard_cmd(sbi, bdev, di.lstart, di.start, di.len); next: prev_dc = next_dc; if (!prev_dc) break; node = rb_next(&prev_dc->rb_node); next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); } } #ifdef CONFIG_BLK_DEV_ZONED static void __queue_zone_reset_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t lblkstart, block_t blklen) { trace_f2fs_queue_reset_zone(bdev, blkstart); mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __insert_discard_cmd(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); } #endif static void __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { block_t lblkstart = blkstart; if (!f2fs_bdev_support_discard(bdev)) return; trace_f2fs_queue_discard(bdev, blkstart, blklen); if (f2fs_is_multi_device(sbi)) { int devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); } static void __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, int *issued) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; struct blk_plug plug; bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); dc = __lookup_discard_cmd_ret(&dcc->root, dcc->next_pos, &prev_dc, &next_dc, &insert_p, &insert_parent); if (!dc) dc = next_dc; blk_start_plug(&plug); while (dc) { struct rb_node *node; int err = 0; if (dc->state != D_PREP) goto next; if (dpolicy->io_aware && !is_idle(sbi, DISCARD_TIME)) { io_interrupted = true; break; } dcc->next_pos = dc->di.lstart + dc->di.len; err = __submit_discard_cmd(sbi, dpolicy, dc, issued); if (*issued >= dpolicy->max_requests) break; next: node = rb_next(&dc->rb_node); if (err) __remove_discard_cmd(sbi, dc); dc = rb_entry_safe(node, struct discard_cmd, rb_node); } blk_finish_plug(&plug); if (!dc) dcc->next_pos = 0; mutex_unlock(&dcc->cmd_lock); if (!(*issued) && io_interrupted) *issued = -1; } static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy); static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; int i, issued; bool io_interrupted = false; if (dpolicy->timeout) f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT); retry: issued = 0; for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { if (dpolicy->timeout && f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) break; if (i + 1 < dpolicy->granularity) break; if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) { __issue_discard_cmd_orderly(sbi, dpolicy, &issued); return issued; } pend_list = &dcc->pend_list[i]; mutex_lock(&dcc->cmd_lock); if (list_empty(pend_list)) goto next; if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); if (dpolicy->timeout && f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) break; if (dpolicy->io_aware && i < dpolicy->io_aware_gran && !is_idle(sbi, DISCARD_TIME)) { io_interrupted = true; break; } __submit_discard_cmd(sbi, dpolicy, dc, &issued); if (issued >= dpolicy->max_requests) break; } blk_finish_plug(&plug); next: mutex_unlock(&dcc->cmd_lock); if (issued >= dpolicy->max_requests || io_interrupted) break; } if (dpolicy->type == DPOLICY_UMOUNT && issued) { __wait_all_discard_cmd(sbi, dpolicy); goto retry; } if (!issued && io_interrupted) issued = -1; return issued; } static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; int i; bool dropped = false; mutex_lock(&dcc->cmd_lock); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { pend_list = &dcc->pend_list[i]; list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); __remove_discard_cmd(sbi, dc); dropped = true; } } mutex_unlock(&dcc->cmd_lock); return dropped; } void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi) { __drop_discard_cmd(sbi); } static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; unsigned int len = 0; wait_for_completion_io(&dc->wait); mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, dc->state != D_DONE); dc->ref--; if (!dc->ref) { if (!dc->error) len = dc->di.len; __remove_discard_cmd(sbi, dc); } mutex_unlock(&dcc->cmd_lock); return len; } static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, block_t start, block_t end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); struct discard_cmd *dc = NULL, *iter, *tmp; unsigned int trimmed = 0; next: dc = NULL; mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(iter, tmp, wait_list, list) { if (iter->di.lstart + iter->di.len <= start || end <= iter->di.lstart) continue; if (iter->di.len < dpolicy->granularity) continue; if (iter->state == D_DONE && !iter->ref) { wait_for_completion_io(&iter->wait); if (!iter->error) trimmed += iter->di.len; __remove_discard_cmd(sbi, iter); } else { iter->ref++; dc = iter; break; } } mutex_unlock(&dcc->cmd_lock); if (dc) { trimmed += __wait_one_discard_bio(sbi, dc); goto next; } return trimmed; } static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { struct discard_policy dp; unsigned int discard_blks; if (dpolicy) return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); /* wait all */ __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, MIN_DISCARD_GRANULARITY); discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, MIN_DISCARD_GRANULARITY); discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); return discard_blks; } /* This should be covered by global mutex, &sit_i->sentry_lock */ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *dc; bool need_wait = false; mutex_lock(&dcc->cmd_lock); dc = __lookup_discard_cmd(sbi, blkaddr); #ifdef CONFIG_BLK_DEV_ZONED if (dc && f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(dc->bdev)) { int devi = f2fs_bdev_index(sbi, dc->bdev); if (devi < 0) { mutex_unlock(&dcc->cmd_lock); return; } if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) { /* force submit zone reset */ if (dc->state == D_PREP) __submit_zone_reset_cmd(sbi, dc, REQ_SYNC, &dcc->wait_list, NULL); dc->ref++; mutex_unlock(&dcc->cmd_lock); /* wait zone reset */ __wait_one_discard_bio(sbi, dc); return; } } #endif if (dc) { if (dc->state == D_PREP) { __punch_discard_cmd(sbi, dc, blkaddr); } else { dc->ref++; need_wait = true; } } mutex_unlock(&dcc->cmd_lock); if (need_wait) __wait_one_discard_bio(sbi, dc); } void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; if (dcc && dcc->f2fs_issue_discard) { struct task_struct *discard_thread = dcc->f2fs_issue_discard; dcc->f2fs_issue_discard = NULL; kthread_stop(discard_thread); } } /** * f2fs_issue_discard_timeout() - Issue all discard cmd within UMOUNT_DISCARD_TIMEOUT * @sbi: the f2fs_sb_info data for discard cmd to issue * * When UMOUNT_DISCARD_TIMEOUT is exceeded, all remaining discard commands will be dropped * * Return true if issued all discard cmd or no discard cmd need issue, otherwise return false. */ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_policy dpolicy; bool dropped; if (!atomic_read(&dcc->discard_cmd_cnt)) return true; __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); /* just to make sure there is no pending discard commands */ __wait_all_discard_cmd(sbi, NULL); f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt)); return !dropped; } static int issue_discard_thread(void *data) { struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; struct discard_policy dpolicy; unsigned int wait_ms = dcc->min_discard_issue_time; int issued; set_freezable(); do { wait_event_freezable_timeout(*q, kthread_should_stop() || dcc->discard_wake, msecs_to_jiffies(wait_ms)); if (sbi->gc_mode == GC_URGENT_HIGH || !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, MIN_DISCARD_GRANULARITY); else __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); if (dcc->discard_wake) dcc->discard_wake = false; /* clean up pending candidates before going to sleep */ if (atomic_read(&dcc->queued_discard)) __wait_all_discard_cmd(sbi, NULL); if (f2fs_readonly(sbi->sb)) continue; if (kthread_should_stop()) return 0; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || !atomic_read(&dcc->discard_cmd_cnt)) { wait_ms = dpolicy.max_interval; continue; } sb_start_intwrite(sbi->sb); issued = __issue_discard_cmd(sbi, &dpolicy); if (issued > 0) { __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; } else if (issued == -1) { wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME); if (!wait_ms) wait_ms = dpolicy.mid_interval; } else { wait_ms = dpolicy.max_interval; } if (!atomic_read(&dcc->discard_cmd_cnt)) wait_ms = dpolicy.max_interval; sb_end_intwrite(sbi->sb); } while (!kthread_should_stop()); return 0; } #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { sector_t sector, nr_sects; block_t lblkstart = blkstart; int devi = 0; u64 remainder = 0; if (f2fs_is_multi_device(sbi)) { devi = f2fs_target_device_index(sbi, blkstart); if (blkstart < FDEV(devi).start_blk || blkstart > FDEV(devi).end_blk) { f2fs_err(sbi, "Invalid block %x", blkstart); return -EIO; } blkstart -= FDEV(devi).start_blk; } /* For sequential zones, reset the zone write pointer */ if (f2fs_blkz_is_seq(sbi, devi, blkstart)) { sector = SECTOR_FROM_BLOCK(blkstart); nr_sects = SECTOR_FROM_BLOCK(blklen); div64_u64_rem(sector, bdev_zone_sectors(bdev), &remainder); if (remainder || nr_sects != bdev_zone_sectors(bdev)) { f2fs_err(sbi, "(%d) %s: Unaligned zone reset attempted (block %x + %x)", devi, sbi->s_ndevs ? FDEV(devi).path : "", blkstart, blklen); return -EIO; } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) { unsigned int nofs_flags; int ret; trace_f2fs_issue_reset_zone(bdev, blkstart); nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, sector, nr_sects); memalloc_nofs_restore(nofs_flags); return ret; } __queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen); return 0; } /* For conventional zones, use regular discard if supported */ __queue_discard_cmd(sbi, bdev, lblkstart, blklen); return 0; } #endif static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif __queue_discard_cmd(sbi, bdev, blkstart, blklen); return 0; } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { sector_t start = blkstart, len = 0; struct block_device *bdev; struct seg_entry *se; unsigned int offset; block_t i; int err = 0; bdev = f2fs_target_device(sbi, blkstart, NULL); for (i = blkstart; i < blkstart + blklen; i++, len++) { if (i != start) { struct block_device *bdev2 = f2fs_target_device(sbi, i, NULL); if (bdev2 != bdev) { err = __issue_discard_async(sbi, bdev, start, len); if (err) return err; bdev = bdev2; start = i; len = 0; } } se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); if (f2fs_block_unit_discard(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } if (len) err = __issue_discard_async(sbi, bdev, start, len); return err; } static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, bool check_only) { int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); unsigned long *cur_map = (unsigned long *)se->cur_valid_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; bool force = (cpc->reason & CP_DISCARD); struct discard_entry *de = NULL; struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; if (se->valid_blocks == BLKS_PER_SEG(sbi) || !f2fs_hw_support_discard(sbi) || !f2fs_block_unit_discard(sbi)) return false; if (!force) { if (!f2fs_realtime_discard_enable(sbi) || (!se->valid_blocks && !is_curseg(sbi, cpc->trim_start)) || SM_I(sbi)->dcc_info->nr_discards >= SM_I(sbi)->dcc_info->max_discards) return false; } /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ for (i = 0; i < entries; i++) dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] : (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; while (force || SM_I(sbi)->dcc_info->nr_discards <= SM_I(sbi)->dcc_info->max_discards) { start = __find_rev_next_bit(dmap, BLKS_PER_SEG(sbi), end + 1); if (start >= BLKS_PER_SEG(sbi)) break; end = __find_rev_next_zero_bit(dmap, BLKS_PER_SEG(sbi), start + 1); if (force && start && end != BLKS_PER_SEG(sbi) && (end - start) < cpc->trim_minlen) continue; if (check_only) return true; if (!de) { de = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_F2FS_ZERO, true, NULL); de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); list_add_tail(&de->list, head); } for (i = start; i < end; i++) __set_bit_le(i, (void *)de->discard_map); SM_I(sbi)->dcc_info->nr_discards += end - start; } return false; } static void release_discard_addr(struct discard_entry *entry) { list_del(&entry->list); kmem_cache_free(discard_entry_slab, entry); } void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi) { struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ list_for_each_entry_safe(entry, this, head, list) release_discard_addr(entry); } /* * Should call f2fs_clear_prefree_segments after checkpoint is done. */ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int segno; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi)) __set_test_and_free(sbi, segno, false); mutex_unlock(&dirty_i->seglist_lock); } void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *head = &dcc->entry_list; struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); bool section_alignment = F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION; if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) section_alignment = true; mutex_lock(&dirty_i->seglist_lock); while (1) { int i; if (section_alignment && end != -1) end--; start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); if (start >= MAIN_SEGS(sbi)) break; end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), start + 1); if (section_alignment) { start = rounddown(start, SEGS_PER_SEC(sbi)); end = roundup(end, SEGS_PER_SEC(sbi)); } for (i = start; i < end; i++) { if (test_and_clear_bit(i, prefree_map)) dirty_i->nr_dirty[PRE]--; } if (!f2fs_realtime_discard_enable(sbi)) continue; if (force && start >= cpc->trim_start && (end - 1) <= cpc->trim_end) continue; /* Should cover 2MB zoned device for zone-based reset */ if (!f2fs_sb_has_blkzoned(sbi) && (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi))) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), SEGS_TO_BLKS(sbi, end - start)); continue; } next: secno = GET_SEC_FROM_SEG(sbi, start); start_segno = GET_SEG_FROM_SEC(sbi, secno); if (!is_cursec(sbi, secno) && !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), BLKS_PER_SEC(sbi)); start = start_segno + SEGS_PER_SEC(sbi); if (start < end) goto next; else end = start - 1; } mutex_unlock(&dirty_i->seglist_lock); if (!f2fs_block_unit_discard(sbi)) goto wakeup; /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { unsigned int cur_pos = 0, next_pos, len, total_len = 0; bool is_valid = test_bit_le(0, entry->discard_map); find_next: if (is_valid) { next_pos = find_next_zero_bit_le(entry->discard_map, BLKS_PER_SEG(sbi), cur_pos); len = next_pos - cur_pos; if (f2fs_sb_has_blkzoned(sbi) || (force && len < cpc->trim_minlen)) goto skip; f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, len); total_len += len; } else { next_pos = find_next_bit_le(entry->discard_map, BLKS_PER_SEG(sbi), cur_pos); } skip: cur_pos = next_pos; is_valid = !is_valid; if (cur_pos < BLKS_PER_SEG(sbi)) goto find_next; release_discard_addr(entry); dcc->nr_discards -= total_len; } wakeup: wake_up_discard_thread(sbi, false); } int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; int err = 0; if (f2fs_sb_has_readonly(sbi)) { f2fs_info(sbi, "Skip to start discard thread for readonly image"); return 0; } if (!f2fs_realtime_discard_enable(sbi)) return 0; dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); dcc->f2fs_issue_discard = NULL; } return err; } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc; int err = 0, i; if (SM_I(sbi)->dcc_info) { dcc = SM_I(sbi)->dcc_info; goto init_thread; } dcc = f2fs_kzalloc(sbi, sizeof(struct discard_cmd_control), GFP_KERNEL); if (!dcc) return -ENOMEM; dcc->discard_io_aware_gran = MAX_PLIST_NUM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT || F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) dcc->discard_granularity = BLKS_PER_SEG(sbi); INIT_LIST_HEAD(&dcc->entry_list); for (i = 0; i < MAX_PLIST_NUM; i++) INIT_LIST_HEAD(&dcc->pend_list[i]); INIT_LIST_HEAD(&dcc->wait_list); INIT_LIST_HEAD(&dcc->fstrim_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->queued_discard, 0); atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = SEGS_TO_BLKS(sbi, MAIN_SEGS(sbi)); dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST; dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; dcc->rbtree_check = false; init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; init_thread: err = f2fs_start_discard_thread(sbi); if (err) { kfree(dcc); SM_I(sbi)->dcc_info = NULL; } return err; } static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; if (!dcc) return; f2fs_stop_discard_thread(sbi); /* * Recovery can cache discard commands, so in error path of * fill_super(), it needs to give a chance to handle them. */ f2fs_issue_discard_timeout(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) { sit_i->dirty_sentries++; return false; } return true; } static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, unsigned int segno, int modified) { struct seg_entry *se = get_seg_entry(sbi, segno); se->type = type; if (modified) __mark_sit_entry_dirty(sbi, segno); } static inline unsigned long long get_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr) { unsigned int segno = GET_SEGNO(sbi, blkaddr); if (segno == NULL_SEGNO) return 0; return get_seg_entry(sbi, segno)->mtime; } static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned long long old_mtime) { struct seg_entry *se; unsigned int segno = GET_SEGNO(sbi, blkaddr); unsigned long long ctime = get_mtime(sbi, false); unsigned long long mtime = old_mtime ? old_mtime : ctime; if (segno == NULL_SEGNO) return; se = get_seg_entry(sbi, segno); if (!se->mtime) se->mtime = mtime; else se->mtime = div_u64(se->mtime * se->valid_blocks + mtime, se->valid_blocks + 1); if (ctime > SIT_I(sbi)->max_mtime) SIT_I(sbi)->max_mtime = ctime; } /* * NOTE: when updating multiple blocks at the same time, please ensure * that the consecutive input blocks belong to the same segment. */ static int update_sit_entry_for_release(struct f2fs_sb_info *sbi, struct seg_entry *se, unsigned int segno, block_t blkaddr, unsigned int offset, int del) { bool exist; #ifdef CONFIG_F2FS_CHECK_FS bool mir_exist; #endif int i; int del_count = -del; f2fs_bug_on(sbi, GET_SEGNO(sbi, blkaddr) != GET_SEGNO(sbi, blkaddr + del_count - 1)); for (i = 0; i < del_count; i++) { exist = f2fs_test_and_clear_bit(offset + i, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS mir_exist = f2fs_test_and_clear_bit(offset + i, se->cur_valid_map_mir); if (unlikely(exist != mir_exist)) { f2fs_err(sbi, "Inconsistent error when clearing bitmap, blk:%u, old bit:%d", blkaddr + i, exist); f2fs_bug_on(sbi, 1); } #endif if (unlikely(!exist)) { f2fs_err(sbi, "Bitmap was wrongly cleared, blk:%u", blkaddr + i); f2fs_bug_on(sbi, 1); se->valid_blocks++; del += 1; } else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { /* * If checkpoints are off, we must not reuse data that * was used in the previous checkpoint. If it was used * before, we must track that to know how much space we * really have. */ if (f2fs_test_bit(offset + i, se->ckpt_valid_map)) { spin_lock(&sbi->stat_lock); sbi->unusable_block_count++; spin_unlock(&sbi->stat_lock); } } if (f2fs_block_unit_discard(sbi) && f2fs_test_and_clear_bit(offset + i, se->discard_map)) sbi->discard_blks++; if (!f2fs_test_bit(offset + i, se->ckpt_valid_map)) { se->ckpt_valid_blocks -= 1; if (__is_large_section(sbi)) get_sec_entry(sbi, segno)->ckpt_valid_blocks -= 1; } } if (__is_large_section(sbi)) sanity_check_valid_blocks(sbi, segno); return del; } static int update_sit_entry_for_alloc(struct f2fs_sb_info *sbi, struct seg_entry *se, unsigned int segno, block_t blkaddr, unsigned int offset, int del) { bool exist; #ifdef CONFIG_F2FS_CHECK_FS bool mir_exist; #endif exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS mir_exist = f2fs_test_and_set_bit(offset, se->cur_valid_map_mir); if (unlikely(exist != mir_exist)) { f2fs_err(sbi, "Inconsistent error when setting bitmap, blk:%u, old bit:%d", blkaddr, exist); f2fs_bug_on(sbi, 1); } #endif if (unlikely(exist)) { f2fs_err(sbi, "Bitmap was wrongly set, blk:%u", blkaddr); f2fs_bug_on(sbi, 1); se->valid_blocks--; del = 0; } if (f2fs_block_unit_discard(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; /* * SSR should never reuse block which is checkpointed * or newly invalidated. */ if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) { se->ckpt_valid_blocks++; if (__is_large_section(sbi)) get_sec_entry(sbi, segno)->ckpt_valid_blocks++; } } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) { se->ckpt_valid_blocks += del; if (__is_large_section(sbi)) get_sec_entry(sbi, segno)->ckpt_valid_blocks += del; } if (__is_large_section(sbi)) sanity_check_valid_blocks(sbi, segno); return del; } /* * If releasing blocks, this function supports updating multiple consecutive blocks * at one time, but please note that these consecutive blocks need to belong to the * same segment. */ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) { struct seg_entry *se; unsigned int segno, offset; long int new_vblocks; segno = GET_SEGNO(sbi, blkaddr); if (segno == NULL_SEGNO) return; se = get_seg_entry(sbi, segno); new_vblocks = se->valid_blocks + del; offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); f2fs_bug_on(sbi, (new_vblocks < 0 || (new_vblocks > f2fs_usable_blks_in_seg(sbi, segno)))); se->valid_blocks = new_vblocks; /* Update valid block bitmap */ if (del > 0) { del = update_sit_entry_for_alloc(sbi, se, segno, blkaddr, offset, del); } else { del = update_sit_entry_for_release(sbi, se, segno, blkaddr, offset, del); } __mark_sit_entry_dirty(sbi, segno); /* update total number of valid blocks to be written in ckpt area */ SIT_I(sbi)->written_valid_blocks += del; if (__is_large_section(sbi)) get_sec_entry(sbi, segno)->valid_blocks += del; } void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr, unsigned int len) { unsigned int segno = GET_SEGNO(sbi, addr); struct sit_info *sit_i = SIT_I(sbi); block_t addr_start = addr, addr_end = addr + len - 1; unsigned int seg_num = GET_SEGNO(sbi, addr_end) - segno + 1; unsigned int i = 1, max_blocks = sbi->blocks_per_seg, cnt; f2fs_bug_on(sbi, addr == NULL_ADDR); if (addr == NEW_ADDR || addr == COMPRESS_ADDR) return; f2fs_invalidate_internal_cache(sbi, addr, len); /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); if (seg_num == 1) cnt = len; else cnt = max_blocks - GET_BLKOFF_FROM_SEG0(sbi, addr); do { update_segment_mtime(sbi, addr_start, 0); update_sit_entry(sbi, addr_start, -cnt); /* add it into dirty seglist */ locate_dirty_segment(sbi, segno); /* update @addr_start and @cnt and @segno */ addr_start = START_BLOCK(sbi, ++segno); if (++i == seg_num) cnt = GET_BLKOFF_FROM_SEG0(sbi, addr_end) + 1; else cnt = max_blocks; } while (i <= seg_num); up_write(&sit_i->sentry_lock); } bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) { struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, offset; struct seg_entry *se; bool is_cp = false; if (!__is_valid_data_blkaddr(blkaddr)) return true; down_read(&sit_i->sentry_lock); segno = GET_SEGNO(sbi, blkaddr); se = get_seg_entry(sbi, segno); offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); if (f2fs_test_bit(offset, se->ckpt_valid_map)) is_cp = true; up_read(&sit_i->sentry_lock); return is_cp; } static unsigned short f2fs_curseg_valid_blocks(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); if (sbi->ckpt->alloc_type[type] == SSR) return BLKS_PER_SEG(sbi); return curseg->next_blkoff; } /* * Calculate the number of current summary pages for writing */ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) { int valid_sum_count = 0; int i, sum_in_page; for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { if (sbi->ckpt->alloc_type[i] != SSR && for_ra) valid_sum_count += le16_to_cpu(F2FS_CKPT(sbi)->cur_data_blkoff[i]); else valid_sum_count += f2fs_curseg_valid_blocks(sbi, i); } sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE; if (valid_sum_count <= sum_in_page) return 1; else if ((valid_sum_count - sum_in_page) <= (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } /* * Caller should put this summary folio */ struct folio *f2fs_get_sum_folio(struct f2fs_sb_info *sbi, unsigned int segno) { if (unlikely(f2fs_cp_error(sbi))) return ERR_PTR(-EIO); return f2fs_get_meta_folio_retry(sbi, GET_SUM_BLOCK(sbi, segno)); } void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { struct folio *folio; if (SUMS_PER_BLOCK == 1) folio = f2fs_grab_meta_folio(sbi, blk_addr); else folio = f2fs_get_meta_folio_retry(sbi, blk_addr); if (IS_ERR(folio)) return; memcpy(folio_address(folio), src, PAGE_SIZE); folio_mark_dirty(folio); f2fs_folio_put(folio, true); } static void write_sum_page(struct f2fs_sb_info *sbi, struct f2fs_summary_block *sum_blk, unsigned int segno) { struct folio *folio; if (SUMS_PER_BLOCK == 1) return f2fs_update_meta_page(sbi, (void *)sum_blk, GET_SUM_BLOCK(sbi, segno)); folio = f2fs_get_sum_folio(sbi, segno); if (IS_ERR(folio)) return; memcpy(SUM_BLK_PAGE_ADDR(folio, segno), sum_blk, sizeof(*sum_blk)); folio_mark_dirty(folio); f2fs_folio_put(folio, true); } static void write_current_sum_page(struct f2fs_sb_info *sbi, int type, block_t blk_addr) { struct curseg_info *curseg = CURSEG_I(sbi, type); struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr); struct f2fs_summary_block *src = curseg->sum_blk; struct f2fs_summary_block *dst; dst = folio_address(folio); memset(dst, 0, PAGE_SIZE); mutex_lock(&curseg->curseg_mutex); down_read(&curseg->journal_rwsem); memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE); up_read(&curseg->journal_rwsem); memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE); memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE); mutex_unlock(&curseg->curseg_mutex); folio_mark_dirty(folio); f2fs_folio_put(folio, true); } static int is_next_segment_free(struct f2fs_sb_info *sbi, struct curseg_info *curseg) { unsigned int segno = curseg->segno + 1; struct free_segmap_info *free_i = FREE_I(sbi); if (segno < MAIN_SEGS(sbi) && segno % SEGS_PER_SEC(sbi)) return !test_bit(segno, free_i->free_segmap); return 0; } /* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG */ static int get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, bool pinning) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); unsigned int alloc_policy = sbi->allocate_section_policy; unsigned int alloc_hint = sbi->allocate_section_hint; bool init = true; int i; int ret = 0; spin_lock(&free_i->segmap_lock); if (time_to_inject(sbi, FAULT_NO_SEGMENT)) { ret = -ENOSPC; goto out_unlock; } if (!new_sec && ((*newseg + 1) % SEGS_PER_SEC(sbi))) { segno = find_next_zero_bit(free_i->free_segmap, GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } #ifdef CONFIG_BLK_DEV_ZONED /* * If we format f2fs on zoned storage, let's try to get pinned sections * from beginning of the storage, which should be a conventional one. */ if (f2fs_sb_has_blkzoned(sbi)) { /* Prioritize writing to conventional zones */ if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_PRIOR_CONV || pinning) segno = 0; else segno = max(sbi->first_seq_zone_segno, *newseg); hint = GET_SEC_FROM_SEG(sbi, segno); } #endif /* * Prevent allocate_section_hint from exceeding MAIN_SECS() * due to desynchronization. */ if (alloc_policy != ALLOCATE_FORWARD_NOHINT && alloc_hint > MAIN_SECS(sbi)) alloc_hint = MAIN_SECS(sbi); if (alloc_policy == ALLOCATE_FORWARD_FROM_HINT && hint < alloc_hint) hint = alloc_hint; else if (alloc_policy == ALLOCATE_FORWARD_WITHIN_HINT && hint >= alloc_hint) hint = 0; find_other_zone: secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); #ifdef CONFIG_BLK_DEV_ZONED if (secno >= MAIN_SECS(sbi) && f2fs_sb_has_blkzoned(sbi)) { /* Write only to sequential zones */ if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_ONLY_SEQ) { hint = GET_SEC_FROM_SEG(sbi, sbi->first_seq_zone_segno); secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); } else secno = find_first_zero_bit(free_i->free_secmap, MAIN_SECS(sbi)); if (secno >= MAIN_SECS(sbi)) { ret = -ENOSPC; f2fs_bug_on(sbi, 1); goto out_unlock; } } #endif if (secno >= MAIN_SECS(sbi)) { secno = find_first_zero_bit(free_i->free_secmap, MAIN_SECS(sbi)); if (secno >= MAIN_SECS(sbi)) { ret = -ENOSPC; f2fs_bug_on(sbi, !pinning); goto out_unlock; } } segno = GET_SEG_FROM_SEC(sbi, secno); zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) goto got_it; if (sbi->secs_per_zone == 1) goto got_it; if (zoneno == old_zoneno) goto got_it; for (i = 0; i < NR_CURSEG_TYPE; i++) if (CURSEG_I(sbi, i)->zone == zoneno) break; if (i < NR_CURSEG_TYPE) { /* zone is in user, try another */ if (zoneno + 1 >= total_zones) hint = 0; else hint = (zoneno + 1) * sbi->secs_per_zone; init = false; goto find_other_zone; } got_it: /* set it as dirty segment in free segmap */ if (test_bit(segno, free_i->free_segmap)) { ret = -EFSCORRUPTED; f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_CORRUPTED_FREE_BITMAP); goto out_unlock; } /* no free section in conventional device or conventional zone */ if (new_sec && pinning && f2fs_is_sequential_zone_area(sbi, START_BLOCK(sbi, segno))) { ret = -EAGAIN; goto out_unlock; } __set_inuse(sbi, segno); *newseg = segno; out_unlock: spin_unlock(&free_i->segmap_lock); if (ret == -ENOSPC && !pinning) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT); return ret; } static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) { struct curseg_info *curseg = CURSEG_I(sbi, type); struct summary_footer *sum_footer; unsigned short seg_type = curseg->seg_type; /* only happen when get_new_segment() fails */ if (curseg->next_segno == NULL_SEGNO) return; curseg->inited = true; curseg->segno = curseg->next_segno; curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; sum_footer = &(curseg->sum_blk->footer); memset(sum_footer, 0, sizeof(struct summary_footer)); sanity_check_seg_type(sbi, seg_type); if (IS_DATASEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA); if (IS_NODESEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE); __set_sit_entry_type(sbi, seg_type, curseg->segno, modified); } static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned short seg_type = curseg->seg_type; sanity_check_seg_type(sbi, seg_type); if (__is_large_section(sbi)) { if (f2fs_need_rand_seg(sbi)) { unsigned int hint = GET_SEC_FROM_SEG(sbi, curseg->segno); if (GET_SEC_FROM_SEG(sbi, curseg->segno + 1) != hint) return curseg->segno; return get_random_u32_inclusive(curseg->segno + 1, GET_SEG_FROM_SEC(sbi, hint + 1) - 1); } return curseg->segno; } else if (f2fs_need_rand_seg(sbi)) { return get_random_u32_below(MAIN_SECS(sbi) * SEGS_PER_SEC(sbi)); } /* inmem log may not locate on any segment after mount */ if (!curseg->inited) return 0; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return 0; if (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type)) return 0; if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) return SIT_I(sbi)->last_victim[ALLOC_NEXT]; /* find segments from 0 to reuse freed segments */ if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) return 0; return curseg->segno; } static void reset_curseg_fields(struct curseg_info *curseg) { curseg->inited = false; curseg->segno = NULL_SEGNO; curseg->next_segno = 0; } /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. */ static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno = curseg->segno; bool pinning = type == CURSEG_COLD_DATA_PINNED; int ret; if (curseg->inited) write_sum_page(sbi, curseg->sum_blk, segno); segno = __get_next_segno(sbi, type); ret = get_new_segment(sbi, &segno, new_sec, pinning); if (ret) { if (ret == -ENOSPC) reset_curseg_fields(curseg); return ret; } curseg->next_segno = segno; reset_curseg(sbi, type, 1); curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = get_random_u32_inclusive(1, sbi->max_fragment_chunk); return 0; } static int __next_free_blkoff(struct f2fs_sb_info *sbi, int segno, block_t start) { struct seg_entry *se = get_seg_entry(sbi, segno); int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); unsigned long *target_map = SIT_I(sbi)->tmp_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *cur_map = (unsigned long *)se->cur_valid_map; int i; for (i = 0; i < entries; i++) target_map[i] = ckpt_map[i] | cur_map[i]; return __find_rev_next_zero_bit(target_map, BLKS_PER_SEG(sbi), start); } static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi, struct curseg_info *seg) { return __next_free_blkoff(sbi, seg->segno, seg->next_blkoff + 1); } bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) { return __next_free_blkoff(sbi, segno, 0) < BLKS_PER_SEG(sbi); } /* * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ static int change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int new_segno = curseg->next_segno; struct f2fs_summary_block *sum_node; struct folio *sum_folio; if (curseg->inited) write_sum_page(sbi, curseg->sum_blk, curseg->segno); __set_test_and_inuse(sbi, new_segno); mutex_lock(&dirty_i->seglist_lock); __remove_dirty_segment(sbi, new_segno, PRE); __remove_dirty_segment(sbi, new_segno, DIRTY); mutex_unlock(&dirty_i->seglist_lock); reset_curseg(sbi, type, 1); curseg->alloc_type = SSR; curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0); sum_folio = f2fs_get_sum_folio(sbi, new_segno); if (IS_ERR(sum_folio)) { /* GC won't be able to use stale summary pages by cp_error */ memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); return PTR_ERR(sum_folio); } sum_node = SUM_BLK_PAGE_ADDR(sum_folio, new_segno); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_folio_put(sum_folio, true); return 0; } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, int alloc_mode, unsigned long long age); static int get_atssr_segment(struct f2fs_sb_info *sbi, int type, int target_type, int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); int ret = 0; curseg->seg_type = target_type; if (get_ssr_segment(sbi, type, alloc_mode, age)) { struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); curseg->seg_type = se->type; ret = change_curseg(sbi, type); } else { /* allocate cold segment by default */ curseg->seg_type = CURSEG_COLD_DATA; ret = new_curseg(sbi, type, true); } stat_inc_seg_type(sbi, curseg); return ret; } static int __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC); int ret = 0; if (!sbi->am.atgc_enabled && !force) return 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); ret = get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0); up_write(&SIT_I(sbi)->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) { return __f2fs_init_atgc_curseg(sbi, false); } int f2fs_reinit_atgc_curseg(struct f2fs_sb_info *sbi) { int ret; if (!test_opt(sbi, ATGC)) return 0; if (sbi->am.atgc_enabled) return 0; if (le64_to_cpu(F2FS_CKPT(sbi)->elapsed_time) < sbi->am.age_threshold) return 0; ret = __f2fs_init_atgc_curseg(sbi, true); if (!ret) { sbi->am.atgc_enabled = true; f2fs_info(sbi, "reenabled age threshold GC"); } return ret; } static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); if (!curseg->inited) goto out; if (get_valid_blocks(sbi, curseg->segno, false)) { write_sum_page(sbi, curseg->sum_blk, curseg->segno); } else { mutex_lock(&DIRTY_I(sbi)->seglist_lock); __set_test_and_free(sbi, curseg->segno, true); mutex_unlock(&DIRTY_I(sbi)->seglist_lock); } out: mutex_unlock(&curseg->curseg_mutex); } void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi) { __f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); if (sbi->am.atgc_enabled) __f2fs_save_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); } static void __f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); if (!curseg->inited) goto out; if (get_valid_blocks(sbi, curseg->segno, false)) goto out; mutex_lock(&DIRTY_I(sbi)->seglist_lock); __set_test_and_inuse(sbi, curseg->segno); mutex_unlock(&DIRTY_I(sbi)->seglist_lock); out: mutex_unlock(&curseg->curseg_mutex); } void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi) { __f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); if (sbi->am.atgc_enabled) __f2fs_restore_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned segno = NULL_SEGNO; unsigned short seg_type = curseg->seg_type; int i, cnt; bool reversed = false; sanity_check_seg_type(sbi, seg_type); /* f2fs_need_SSR() already forces to do this */ if (!f2fs_get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age, false)) { curseg->next_segno = segno; return 1; } /* For node segments, let's do SSR more intensively */ if (IS_NODESEG(seg_type)) { if (seg_type >= CURSEG_WARM_NODE) { reversed = true; i = CURSEG_COLD_NODE; } else { i = CURSEG_HOT_NODE; } cnt = NR_CURSEG_NODE_TYPE; } else { if (seg_type >= CURSEG_WARM_DATA) { reversed = true; i = CURSEG_COLD_DATA; } else { i = CURSEG_HOT_DATA; } cnt = NR_CURSEG_DATA_TYPE; } for (; cnt-- > 0; reversed ? i-- : i++) { if (i == seg_type) continue; if (!f2fs_get_victim(sbi, &segno, BG_GC, i, alloc_mode, age, false)) { curseg->next_segno = segno; return 1; } } /* find valid_blocks=0 in dirty list */ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { segno = get_free_segment(sbi); if (segno != NULL_SEGNO) { curseg->next_segno = segno; return 1; } } return 0; } static bool need_new_seg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && curseg->seg_type == CURSEG_WARM_NODE) return true; if (curseg->alloc_type == LFS && is_next_segment_free(sbi, curseg) && likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0)) return true; return false; } int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno; int ret = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); segno = CURSEG_I(sbi, type)->segno; if (segno < start || segno > end) goto unlock; if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) ret = change_curseg(sbi, type); else ret = new_curseg(sbi, type, true); stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, segno); unlock: up_write(&SIT_I(sbi)->sentry_lock); if (segno != curseg->segno) f2fs_notice(sbi, "For resize: curseg of type %d: %u ==> %u", type, segno, curseg->segno); mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } static int __allocate_new_segment(struct f2fs_sb_info *sbi, int type, bool new_sec, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; int err = 0; if (type == CURSEG_COLD_DATA_PINNED && !curseg->inited) goto allocate; if (!force && curseg->inited && !curseg->next_blkoff && !get_valid_blocks(sbi, curseg->segno, new_sec) && !get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) return 0; allocate: old_segno = curseg->segno; err = new_curseg(sbi, type, true); if (err) return err; stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); return 0; } int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) { int ret; f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); ret = __allocate_new_segment(sbi, type, true, force); up_write(&SIT_I(sbi)->sentry_lock); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi) { int err; bool gc_required = true; retry: f2fs_lock_op(sbi); err = f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); f2fs_unlock_op(sbi); if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) { f2fs_down_write(&sbi->gc_lock); err = f2fs_gc_range(sbi, 0, sbi->first_seq_zone_segno - 1, true, ZONED_PIN_SEC_REQUIRED_COUNT); f2fs_up_write(&sbi->gc_lock); gc_required = false; if (!err) goto retry; } return err; } int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { int i; int err = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) err += __allocate_new_segment(sbi, i, false, false); up_write(&SIT_I(sbi)->sentry_lock); f2fs_up_read(&SM_I(sbi)->curseg_lock); return err; } bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { __u64 trim_start = cpc->trim_start; bool has_candidate = false; down_write(&SIT_I(sbi)->sentry_lock); for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { if (add_discard_addrs(sbi, cpc, true)) { has_candidate = true; break; } } up_write(&SIT_I(sbi)->sentry_lock); cpc->trim_start = trim_start; return has_candidate; } static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, unsigned int start, unsigned int end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; struct blk_plug plug; int issued; unsigned int trimmed = 0; next: issued = 0; mutex_lock(&dcc->cmd_lock); if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi)); dc = __lookup_discard_cmd_ret(&dcc->root, start, &prev_dc, &next_dc, &insert_p, &insert_parent); if (!dc) dc = next_dc; blk_start_plug(&plug); while (dc && dc->di.lstart <= end) { struct rb_node *node; int err = 0; if (dc->di.len < dpolicy->granularity) goto skip; if (dc->state != D_PREP) { list_move_tail(&dc->list, &dcc->fstrim_list); goto skip; } err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); if (issued >= dpolicy->max_requests) { start = dc->di.lstart + dc->di.len; if (err) __remove_discard_cmd(sbi, dc); blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); f2fs_schedule_timeout(DEFAULT_DISCARD_INTERVAL); goto next; } skip: node = rb_next(&dc->rb_node); if (err) __remove_discard_cmd(sbi, dc); dc = rb_entry_safe(node, struct discard_cmd, rb_node); if (fatal_signal_pending(current)) break; } blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); return trimmed; } int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; block_t start_block, end_block; struct cp_control cpc; struct discard_policy dpolicy; unsigned long long trimmed = 0; int err = 0; bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; if (end < MAIN_BLKADDR(sbi)) goto out; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { f2fs_warn(sbi, "Found FS corruption, run fsck to fix."); return -EFSCORRUPTED; } /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); if (need_align) { start_segno = rounddown(start_segno, SEGS_PER_SEC(sbi)); end_segno = roundup(end_segno + 1, SEGS_PER_SEC(sbi)) - 1; } cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); cpc.trim_start = start_segno; cpc.trim_end = end_segno; if (sbi->discard_blks == 0) goto out; f2fs_down_write(&sbi->gc_lock); stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); f2fs_up_write(&sbi->gc_lock); if (err) goto out; /* * We filed discard candidates, but actually we don't need to wait for * all of them, since they'll be issued in idle time along with runtime * discard option. User configuration looks like using runtime discard * or periodic fstrim instead of it. */ if (f2fs_realtime_discard_enable(sbi)) goto out; start_block = START_BLOCK(sbi, start_segno); end_block = START_BLOCK(sbi, end_segno + 1); __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); trimmed = __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); trimmed += __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); out: if (!err) range->len = F2FS_BLK_TO_BYTES(trimmed); return err; } int f2fs_rw_hint_to_seg_type(struct f2fs_sb_info *sbi, enum rw_hint hint) { if (F2FS_OPTION(sbi).active_logs == 2) return CURSEG_HOT_DATA; else if (F2FS_OPTION(sbi).active_logs == 4) return CURSEG_COLD_DATA; /* active_log == 6 */ switch (hint) { case WRITE_LIFE_SHORT: return CURSEG_HOT_DATA; case WRITE_LIFE_EXTREME: return CURSEG_COLD_DATA; default: return CURSEG_WARM_DATA; } } /* * This returns write hints for each segment type. This hints will be * passed down to block layer as below by default. * * User F2FS Block * ---- ---- ----- * META WRITE_LIFE_NONE|REQ_META * HOT_NODE WRITE_LIFE_NONE * WARM_NODE WRITE_LIFE_MEDIUM * COLD_NODE WRITE_LIFE_LONG * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME * extension list " " * * -- buffered io * COLD_DATA WRITE_LIFE_EXTREME * HOT_DATA WRITE_LIFE_SHORT * WARM_DATA WRITE_LIFE_NOT_SET * * -- direct io * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET * WRITE_LIFE_NONE " WRITE_LIFE_NONE * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM * WRITE_LIFE_LONG " WRITE_LIFE_LONG */ enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { switch (type) { case DATA: switch (temp) { case WARM: return WRITE_LIFE_NOT_SET; case HOT: return WRITE_LIFE_SHORT; case COLD: return WRITE_LIFE_EXTREME; default: return WRITE_LIFE_NONE; } case NODE: switch (temp) { case WARM: return WRITE_LIFE_MEDIUM; case HOT: return WRITE_LIFE_NONE; case COLD: return WRITE_LIFE_LONG; default: return WRITE_LIFE_NONE; } case META: return WRITE_LIFE_NONE; default: return WRITE_LIFE_NONE; } } static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) return CURSEG_HOT_DATA; else return CURSEG_HOT_NODE; } static int __get_segment_type_4(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio_inode(fio); if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; else return CURSEG_COLD_DATA; } else { if (IS_DNODE(fio->folio) && is_cold_node(fio->folio)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; } } static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_info ei = {}; if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { if (!ei.age) return NO_CHECK_TYPE; if (ei.age <= sbi->hot_data_age_threshold) return CURSEG_HOT_DATA; if (ei.age <= sbi->warm_data_age_threshold) return CURSEG_WARM_DATA; return CURSEG_COLD_DATA; } return NO_CHECK_TYPE; } static int __get_segment_type_6(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio_inode(fio); int type; if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return CURSEG_COLD_DATA_PINNED; if (page_private_gcing(fio->page)) { if (fio->sbi->am.atgc_enabled && (fio->io_type == FS_DATA_IO) && (fio->sbi->gc_mode != GC_URGENT_HIGH) && __is_valid_data_blkaddr(fio->old_blkaddr) && !is_inode_flag_set(inode, FI_OPU_WRITE)) return CURSEG_ALL_DATA_ATGC; else return CURSEG_COLD_DATA; } if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; type = __get_age_segment_type(inode, fio->folio->index); if (type != NO_CHECK_TYPE) return type; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || f2fs_is_cow_file(inode) || is_inode_flag_set(inode, FI_NEED_IPU)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); } else { if (IS_DNODE(fio->folio)) return is_cold_node(fio->folio) ? CURSEG_WARM_NODE : CURSEG_HOT_NODE; return CURSEG_COLD_NODE; } } enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, enum log_type type) { struct curseg_info *curseg = CURSEG_I(sbi, type); enum temp_type temp = COLD; switch (curseg->seg_type) { case CURSEG_HOT_NODE: case CURSEG_HOT_DATA: temp = HOT; break; case CURSEG_WARM_NODE: case CURSEG_WARM_DATA: temp = WARM; break; case CURSEG_COLD_NODE: case CURSEG_COLD_DATA: temp = COLD; break; default: f2fs_bug_on(sbi, 1); } return temp; } static int __get_segment_type(struct f2fs_io_info *fio) { enum log_type type = CURSEG_HOT_DATA; switch (F2FS_OPTION(fio->sbi).active_logs) { case 2: type = __get_segment_type_2(fio); break; case 4: type = __get_segment_type_4(fio); break; case 6: type = __get_segment_type_6(fio); break; default: f2fs_bug_on(fio->sbi, true); } fio->temp = f2fs_get_segment_temp(fio->sbi, type); return type; } static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, struct curseg_info *seg) { /* To allocate block chunks in different sizes, use random number */ if (--seg->fragment_remained_chunk > 0) return; seg->fragment_remained_chunk = get_random_u32_inclusive(1, sbi->max_fragment_chunk); seg->next_blkoff += get_random_u32_inclusive(1, sbi->max_fragment_hole); } int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned long long old_mtime; bool from_gc = (type == CURSEG_ALL_DATA_ATGC); struct seg_entry *se = NULL; bool segment_full = false; int ret = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); if (curseg->segno == NULL_SEGNO) { ret = -ENOSPC; goto out_err; } if (from_gc) { f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO); se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr)); sanity_check_seg_type(sbi, se->type); f2fs_bug_on(sbi, IS_NODESEG(se->type)); } *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); f2fs_bug_on(sbi, curseg->next_blkoff >= BLKS_PER_SEG(sbi)); f2fs_wait_discard_bio(sbi, *new_blkaddr); curseg->sum_blk->entries[curseg->next_blkoff] = *sum; if (curseg->alloc_type == SSR) { curseg->next_blkoff = f2fs_find_next_ssr_block(sbi, curseg); } else { curseg->next_blkoff++; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) f2fs_randomize_chunk(sbi, curseg); } if (curseg->next_blkoff >= f2fs_usable_blks_in_seg(sbi, curseg->segno)) segment_full = true; stat_inc_block_count(sbi, curseg); if (from_gc) { old_mtime = get_segment_mtime(sbi, old_blkaddr); } else { update_segment_mtime(sbi, old_blkaddr, 0); old_mtime = 0; } update_segment_mtime(sbi, *new_blkaddr, old_mtime); /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. */ update_sit_entry(sbi, *new_blkaddr, 1); update_sit_entry(sbi, old_blkaddr, -1); /* * If the current segment is full, flush it out and replace it with a * new segment. */ if (segment_full) { if (type == CURSEG_COLD_DATA_PINNED && !((curseg->segno + 1) % sbi->segs_per_sec)) { write_sum_page(sbi, curseg->sum_blk, curseg->segno); reset_curseg_fields(curseg); goto skip_new_segment; } if (from_gc) { ret = get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); } else { if (need_new_seg(sbi, type)) ret = new_curseg(sbi, type, false); else ret = change_curseg(sbi, type); stat_inc_seg_type(sbi, curseg); } if (ret) goto out_err; } skip_new_segment: /* * segment dirty status should be updated after segment allocation, * so we just need to update status only one time after previous * segment being closed. */ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); if (IS_DATASEG(curseg->seg_type)) { unsigned long long new_val; new_val = atomic64_inc_return(&sbi->allocated_data_blocks); if (unlikely(new_val == ULLONG_MAX)) atomic64_set(&sbi->allocated_data_blocks, 0); } up_write(&sit_i->sentry_lock); if (folio && IS_NODESEG(curseg->seg_type)) { fill_node_footer_blkaddr(folio, NEXT_FREE_BLKADDR(sbi, curseg)); f2fs_inode_chksum_set(sbi, folio); } if (fio) { struct f2fs_bio_info *io; INIT_LIST_HEAD(&fio->list); fio->in_list = 1; io = sbi->write_io[fio->type] + fio->temp; spin_lock(&io->io_lock); list_add_tail(&fio->list, &io->io_list); spin_unlock(&io->io_lock); } mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return 0; out_err: *new_blkaddr = NULL_ADDR; up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, block_t blkaddr, unsigned int blkcnt) { if (!f2fs_is_multi_device(sbi)) return; while (1) { unsigned int devidx = f2fs_target_device_index(sbi, blkaddr); unsigned int blks = FDEV(devidx).end_blk - blkaddr + 1; /* update device state for fsync */ f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO); /* update device state for checkpoint */ if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { spin_lock(&sbi->dev_lock); f2fs_set_bit(devidx, (char *)&sbi->dirty_device); spin_unlock(&sbi->dev_lock); } if (blkcnt <= blks) break; blkcnt -= blks; blkaddr += blks; } } static int log_type_to_seg_type(enum log_type type) { int seg_type = CURSEG_COLD_DATA; switch (type) { case CURSEG_HOT_DATA: case CURSEG_WARM_DATA: case CURSEG_COLD_DATA: case CURSEG_HOT_NODE: case CURSEG_WARM_NODE: case CURSEG_COLD_NODE: seg_type = (int)type; break; case CURSEG_COLD_DATA_PINNED: case CURSEG_ALL_DATA_ATGC: seg_type = CURSEG_COLD_DATA; break; default: break; } return seg_type; } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { struct folio *folio = fio->folio; enum log_type type = __get_segment_type(fio); int seg_type = log_type_to_seg_type(type); bool keep_order = (f2fs_lfs_mode(fio->sbi) && seg_type == CURSEG_COLD_DATA); int err; if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); err = f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio); if (unlikely(err)) { f2fs_err_ratelimited(fio->sbi, "%s Failed to allocate data block, ino:%u, index:%lu, type:%d, old_blkaddr:0x%x, new_blkaddr:0x%x, err:%d", __func__, fio->ino, folio->index, type, fio->old_blkaddr, fio->new_blkaddr, err); if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); folio_end_writeback(folio); if (f2fs_in_warm_node_list(fio->sbi, folio)) f2fs_del_fsync_node_entry(fio->sbi, folio); f2fs_bug_on(fio->sbi, !is_set_ckpt_flags(fio->sbi, CP_ERROR_FLAG)); goto out; } f2fs_bug_on(fio->sbi, !f2fs_is_valid_blkaddr_raw(fio->sbi, fio->new_blkaddr, DATA_GENERIC_ENHANCE)); if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr, 1); /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); out: if (keep_order) f2fs_up_read(&fio->sbi->io_order_lock); } void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio, enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, .type = META, .temp = HOT, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = folio->index, .new_blkaddr = folio->index, .folio = folio, .encrypted_page = NULL, .in_list = 0, }; if (unlikely(folio->index >= MAIN_BLKADDR(sbi))) fio.op_flags &= ~REQ_META; folio_start_writeback(folio); f2fs_submit_page_write(&fio); stat_inc_meta_count(sbi, folio->index); f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE); } void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) { struct f2fs_summary sum; set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE); } void f2fs_outplace_write_data(struct dnode_of_data *dn, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; struct f2fs_summary sum; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO) f2fs_update_age_extent_cache(dn); set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE); } int f2fs_inplace_write_data(struct f2fs_io_info *fio) { int err; struct f2fs_sb_info *sbi = fio->sbi; unsigned int segno; fio->new_blkaddr = fio->old_blkaddr; /* i/o temperature is needed for passing down write hints */ __get_segment_type(fio); segno = GET_SEGNO(sbi, fio->new_blkaddr); if (!IS_DATASEG(get_seg_entry(sbi, segno)->type)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, segno); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); goto drop_bio; } if (f2fs_cp_error(sbi)) { err = -EIO; goto drop_bio; } if (fio->meta_gc) f2fs_truncate_meta_inode_pages(sbi, fio->new_blkaddr, 1); stat_inc_inplace_blocks(fio->sbi); if (fio->bio && !IS_F2FS_IPU_NOCACHE(sbi)) err = f2fs_merge_page_bio(fio); else err = f2fs_submit_page_bio(fio); if (!err) { f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); f2fs_update_iostat(fio->sbi, fio_inode(fio), fio->io_type, F2FS_BLKSIZE); } return err; drop_bio: if (fio->bio && *(fio->bio)) { struct bio *bio = *(fio->bio); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); *(fio->bio) = NULL; } return err; } static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, unsigned int segno) { int i; for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { if (CURSEG_I(sbi, i)->segno == segno) break; } return i; } void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr, bool from_gc) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; unsigned int segno, old_cursegno; struct seg_entry *se; int type; unsigned short old_blkoff; unsigned char old_alloc_type; segno = GET_SEGNO(sbi, new_blkaddr); se = get_seg_entry(sbi, segno); type = se->type; f2fs_down_write(&SM_I(sbi)->curseg_lock); if (!recover_curseg) { /* for recovery flow */ if (se->valid_blocks == 0 && !is_curseg(sbi, segno)) { if (old_blkaddr == NULL_ADDR) type = CURSEG_COLD_DATA; else type = CURSEG_WARM_DATA; } } else { if (is_curseg(sbi, segno)) { /* se->type is volatile as SSR allocation */ type = __f2fs_get_curseg(sbi, segno); f2fs_bug_on(sbi, type == NO_CHECK_TYPE); } else { type = CURSEG_WARM_DATA; } } curseg = CURSEG_I(sbi, type); f2fs_bug_on(sbi, !IS_DATASEG(curseg->seg_type)); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); old_cursegno = curseg->segno; old_blkoff = curseg->next_blkoff; old_alloc_type = curseg->alloc_type; /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; if (change_curseg(sbi, type)) goto out_unlock; } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); curseg->sum_blk->entries[curseg->next_blkoff] = *sum; if (!recover_curseg || recover_newaddr) { if (!from_gc) update_segment_mtime(sbi, new_blkaddr, 0); update_sit_entry(sbi, new_blkaddr, 1); } if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { f2fs_invalidate_internal_cache(sbi, old_blkaddr, 1); if (!from_gc) update_segment_mtime(sbi, old_blkaddr, 0); update_sit_entry(sbi, old_blkaddr, -1); } locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr)); locate_dirty_segment(sbi, old_cursegno); if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; if (change_curseg(sbi, type)) goto out_unlock; } curseg->next_blkoff = old_blkoff; curseg->alloc_type = old_alloc_type; } out_unlock: up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_write(&SM_I(sbi)->curseg_lock); } void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, bool recover_newaddr) { struct f2fs_summary sum; set_summary(&sum, dn->nid, dn->ofs_in_node, version); f2fs_do_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg, recover_newaddr, false); f2fs_update_data_blkaddr(dn, new_addr); } void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, bool ordered, bool locked) { if (folio_test_writeback(folio)) { struct f2fs_sb_info *sbi = F2FS_F_SB(folio); /* submit cached LFS IO */ f2fs_submit_merged_write_cond(sbi, NULL, folio, 0, type); /* submit cached IPU IO */ f2fs_submit_merged_ipu_write(sbi, NULL, folio); if (ordered) { folio_wait_writeback(folio); f2fs_bug_on(sbi, locked && folio_test_writeback(folio)); } else { folio_wait_stable(folio); } } } void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct folio *cfolio; if (!f2fs_meta_inode_gc_required(inode)) return; if (!__is_valid_data_blkaddr(blkaddr)) return; cfolio = filemap_lock_folio(META_MAPPING(sbi), blkaddr); if (!IS_ERR(cfolio)) { f2fs_folio_wait_writeback(cfolio, DATA, true, true); f2fs_folio_put(cfolio, true); } } void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); block_t i; if (!f2fs_meta_inode_gc_required(inode)) return; for (i = 0; i < len; i++) f2fs_wait_on_block_writeback(inode, blkaddr + i); f2fs_truncate_meta_inode_pages(sbi, blkaddr, len); } static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *seg_i; unsigned char *kaddr; struct folio *folio; block_t start; int i, j, offset; start = start_sum_block(sbi); folio = f2fs_get_meta_folio(sbi, start++); if (IS_ERR(folio)) return PTR_ERR(folio); kaddr = folio_address(folio); /* Step 1: restore nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE); /* Step 2: restore sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE); offset = 2 * SUM_JOURNAL_SIZE; /* Step 3: restore summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { unsigned short blk_off; unsigned int segno; seg_i = CURSEG_I(sbi, i); segno = le32_to_cpu(ckpt->cur_data_segno[i]); blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]); seg_i->next_segno = segno; reset_curseg(sbi, i, 0); seg_i->alloc_type = ckpt->alloc_type[i]; seg_i->next_blkoff = blk_off; if (seg_i->alloc_type == SSR) blk_off = BLKS_PER_SEG(sbi); for (j = 0; j < blk_off; j++) { struct f2fs_summary *s; s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] = *s; offset += SUMMARY_SIZE; if (offset + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; f2fs_folio_put(folio, true); folio = f2fs_get_meta_folio(sbi, start++); if (IS_ERR(folio)) return PTR_ERR(folio); kaddr = folio_address(folio); offset = 0; } } f2fs_folio_put(folio, true); return 0; } static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_summary_block *sum; struct curseg_info *curseg; struct folio *new; unsigned short blk_off; unsigned int segno = 0; block_t blk_addr = 0; int err = 0; /* get segment number and block addr */ if (IS_DATASEG(type)) { segno = le32_to_cpu(ckpt->cur_data_segno[type]); blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - CURSEG_HOT_DATA]); if (__exist_node_summaries(sbi)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type); else blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); } else { segno = le32_to_cpu(ckpt->cur_node_segno[type - CURSEG_HOT_NODE]); blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - CURSEG_HOT_NODE]); if (__exist_node_summaries(sbi)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, type - CURSEG_HOT_NODE); else blk_addr = GET_SUM_BLOCK(sbi, segno); } new = f2fs_get_meta_folio(sbi, blk_addr); if (IS_ERR(new)) return PTR_ERR(new); sum = folio_address(new); if (IS_NODESEG(type)) { if (__exist_node_summaries(sbi)) { struct f2fs_summary *ns = &sum->entries[0]; int i; for (i = 0; i < BLKS_PER_SEG(sbi); i++, ns++) { ns->version = 0; ns->ofs_in_node = 0; } } else { err = f2fs_restore_node_summary(sbi, segno, sum); if (err) goto out; } } /* set uncompleted segment to curseg */ curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); /* update journal info */ down_write(&curseg->journal_rwsem); memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE); up_write(&curseg->journal_rwsem); memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE); memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE); curseg->next_segno = segno; reset_curseg(sbi, type, 0); curseg->alloc_type = ckpt->alloc_type[type]; curseg->next_blkoff = blk_off; mutex_unlock(&curseg->curseg_mutex); out: f2fs_folio_put(new, true); return err; } static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; int type = CURSEG_HOT_DATA; int err; if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { int npages = f2fs_npages_for_summary_flush(sbi, true); if (npages >= 2) f2fs_ra_meta_pages(sbi, start_sum_block(sbi), npages, META_CP, true); /* restore for compacted data summary */ err = read_compacted_summaries(sbi); if (err) return err; type = CURSEG_HOT_NODE; } if (__exist_node_summaries(sbi)) f2fs_ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type), NR_CURSEG_PERSIST_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { err = read_normal_summaries(sbi, type); if (err) return err; } /* sanity check for summary blocks */ if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) { f2fs_err(sbi, "invalid journal entries nats %u sits %u", nats_in_cursum(nat_j), sits_in_cursum(sit_j)); return -EINVAL; } return 0; } static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) { struct folio *folio; unsigned char *kaddr; struct f2fs_summary *summary; struct curseg_info *seg_i; int written_size = 0; int i, j; folio = f2fs_grab_meta_folio(sbi, blkaddr++); kaddr = folio_address(folio); memset(kaddr, 0, PAGE_SIZE); /* Step 1: write nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 2: write sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { seg_i = CURSEG_I(sbi, i); for (j = 0; j < f2fs_curseg_valid_blocks(sbi, i); j++) { if (!folio) { folio = f2fs_grab_meta_folio(sbi, blkaddr++); kaddr = folio_address(folio); memset(kaddr, 0, PAGE_SIZE); written_size = 0; } summary = (struct f2fs_summary *)(kaddr + written_size); *summary = seg_i->sum_blk->entries[j]; written_size += SUMMARY_SIZE; if (written_size + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; folio_mark_dirty(folio); f2fs_folio_put(folio, true); folio = NULL; } } if (folio) { folio_mark_dirty(folio); f2fs_folio_put(folio, true); } } static void write_normal_summaries(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { int i, end; if (IS_DATASEG(type)) end = type + NR_CURSEG_DATA_TYPE; else end = type + NR_CURSEG_NODE_TYPE; for (i = type; i < end; i++) write_current_sum_page(sbi, i, blkaddr + (i - type)); } void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); else write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); } void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc) { int i; if (type == NAT_JOURNAL) { for (i = 0; i < nats_in_cursum(journal); i++) { if (le32_to_cpu(nid_in_journal(journal, i)) == val) return i; } if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL)) return update_nats_in_cursum(journal, 1); } else if (type == SIT_JOURNAL) { for (i = 0; i < sits_in_cursum(journal); i++) if (le32_to_cpu(segno_in_journal(journal, i)) == val) return i; if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL)) return update_sits_in_cursum(journal, 1); } return -1; } static struct folio *get_current_sit_folio(struct f2fs_sb_info *sbi, unsigned int segno) { return f2fs_get_meta_folio(sbi, current_sit_addr(sbi, segno)); } static struct folio *get_next_sit_folio(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); struct folio *folio; pgoff_t src_off, dst_off; src_off = current_sit_addr(sbi, start); dst_off = next_sit_addr(sbi, src_off); folio = f2fs_grab_meta_folio(sbi, dst_off); seg_info_to_sit_folio(sbi, folio, start); folio_mark_dirty(folio); set_to_next_sit(sit_i, start); return folio; } static struct sit_entry_set *grab_sit_entry_set(void) { struct sit_entry_set *ses = f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS, true, NULL); ses->entry_cnt = 0; INIT_LIST_HEAD(&ses->set_list); return ses; } static void release_sit_entry_set(struct sit_entry_set *ses) { list_del(&ses->set_list); kmem_cache_free(sit_entry_set_slab, ses); } static void adjust_sit_entry_set(struct sit_entry_set *ses, struct list_head *head) { struct sit_entry_set *next = ses; if (list_is_last(&ses->set_list, head)) return; list_for_each_entry_continue(next, head, set_list) if (ses->entry_cnt <= next->entry_cnt) { list_move_tail(&ses->set_list, &next->set_list); return; } list_move_tail(&ses->set_list, head); } static void add_sit_entry(unsigned int segno, struct list_head *head) { struct sit_entry_set *ses; unsigned int start_segno = START_SEGNO(segno); list_for_each_entry(ses, head, set_list) { if (ses->start_segno == start_segno) { ses->entry_cnt++; adjust_sit_entry_set(ses, head); return; } } ses = grab_sit_entry_set(); ses->start_segno = start_segno; ses->entry_cnt++; list_add(&ses->set_list, head); } static void add_sits_in_set(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); struct list_head *set_list = &sm_info->sit_entry_set; unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap; unsigned int segno; for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi)) add_sit_entry(segno, set_list); } static void remove_sits_in_journal(struct f2fs_sb_info *sbi) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_journal *journal = curseg->journal; int i; down_write(&curseg->journal_rwsem); for (i = 0; i < sits_in_cursum(journal); i++) { unsigned int segno; bool dirtied; segno = le32_to_cpu(segno_in_journal(journal, i)); dirtied = __mark_sit_entry_dirty(sbi, segno); if (!dirtied) add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); } update_sits_in_cursum(journal, -i); up_write(&curseg->journal_rwsem); } /* * CP calls this function, which flushes SIT entries including sit_journal, * and moves prefree segs to free segs. */ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_journal *journal = curseg->journal; struct sit_entry_set *ses, *tmp; struct list_head *head = &SM_I(sbi)->sit_entry_set; bool to_journal = !is_sbi_flag_set(sbi, SBI_IS_RESIZEFS); struct seg_entry *se; down_write(&sit_i->sentry_lock); if (!sit_i->dirty_sentries) goto out; /* * add and account sit entries of dirty bitmap in sit entry * set temporarily */ add_sits_in_set(sbi); /* * if there are no enough space in journal to store dirty sit * entries, remove all entries from journal and add and account * them in sit entry set. */ if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL) || !to_journal) remove_sits_in_journal(sbi); /* * there are two steps to flush sit entries: * #1, flush sit entries to journal in current cold data summary block. * #2, flush sit entries to sit page. */ list_for_each_entry_safe(ses, tmp, head, set_list) { struct folio *folio = NULL; struct f2fs_sit_block *raw_sit = NULL; unsigned int start_segno = ses->start_segno; unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK, (unsigned long)MAIN_SEGS(sbi)); unsigned int segno = start_segno; if (to_journal && !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL)) to_journal = false; if (to_journal) { down_write(&curseg->journal_rwsem); } else { folio = get_next_sit_folio(sbi, start_segno); raw_sit = folio_address(folio); } /* flush dirty sit entries in region of current sit set */ for_each_set_bit_from(segno, bitmap, end) { int offset, sit_offset; se = get_seg_entry(sbi, segno); #ifdef CONFIG_F2FS_CHECK_FS if (memcmp(se->cur_valid_map, se->cur_valid_map_mir, SIT_VBLOCK_MAP_SIZE)) f2fs_bug_on(sbi, 1); #endif /* add discard candidates */ if (!(cpc->reason & CP_DISCARD)) { cpc->trim_start = segno; add_discard_addrs(sbi, cpc, false); } if (to_journal) { offset = f2fs_lookup_journal_in_cursum(journal, SIT_JOURNAL, segno, 1); f2fs_bug_on(sbi, offset < 0); segno_in_journal(journal, offset) = cpu_to_le32(segno); seg_info_to_raw_sit(se, &sit_in_journal(journal, offset)); check_block_count(sbi, segno, &sit_in_journal(journal, offset)); } else { sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); check_block_count(sbi, segno, &raw_sit->entries[sit_offset]); } /* update ckpt_valid_block */ if (__is_large_section(sbi)) { set_ckpt_valid_blocks(sbi, segno); sanity_check_valid_blocks(sbi, segno); } __clear_bit(segno, bitmap); sit_i->dirty_sentries--; ses->entry_cnt--; } if (to_journal) up_write(&curseg->journal_rwsem); else f2fs_folio_put(folio, true); f2fs_bug_on(sbi, ses->entry_cnt); release_sit_entry_set(ses); } f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); out: if (cpc->reason & CP_DISCARD) { __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) add_discard_addrs(sbi, cpc, false); cpc->trim_start = trim_start; } up_write(&sit_i->sentry_lock); set_prefree_as_free_segments(sbi); } static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; char *src_bitmap, *bitmap; unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size; unsigned int discard_map = f2fs_block_unit_discard(sbi) ? 1 : 0; /* allocate memory for SIT information */ sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); if (!sit_i) return -ENOMEM; SM_I(sbi)->sit_info = sit_i; sit_i->sentries = f2fs_kvzalloc(sbi, array_size(sizeof(struct seg_entry), MAIN_SEGS(sbi)), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; main_bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, main_bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (3 + discard_map); #else bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (2 + discard_map); #endif sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!sit_i->bitmap) return -ENOMEM; bitmap = sit_i->bitmap; for (start = 0; start < MAIN_SEGS(sbi); start++) { sit_i->sentries[start].cur_valid_map = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; sit_i->sentries[start].ckpt_valid_map = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; #ifdef CONFIG_F2FS_CHECK_FS sit_i->sentries[start].cur_valid_map_mir = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; #endif if (discard_map) { sit_i->sentries[start].discard_map = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; } } sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->tmp_map) return -ENOMEM; if (__is_large_section(sbi)) { sit_i->sec_entries = f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry), MAIN_SECS(sbi)), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; } /* get information related with SIT */ sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1; /* setup SIT bitmap from ckeckpoint pack */ sit_bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); sit_i->sit_bitmap = kmemdup(src_bitmap, sit_bitmap_size, GFP_KERNEL); if (!sit_i->sit_bitmap) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS sit_i->sit_bitmap_mir = kmemdup(src_bitmap, sit_bitmap_size, GFP_KERNEL); if (!sit_i->sit_bitmap_mir) return -ENOMEM; sit_i->invalid_segmap = f2fs_kvzalloc(sbi, main_bitmap_size, GFP_KERNEL); if (!sit_i->invalid_segmap) return -ENOMEM; #endif sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = SEGS_TO_BLKS(sbi, sit_segs); sit_i->written_valid_blocks = 0; sit_i->bitmap_size = sit_bitmap_size; sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); sit_i->mounted_time = ktime_get_boottime_seconds(); init_rwsem(&sit_i->sentry_lock); return 0; } static int build_free_segmap(struct f2fs_sb_info *sbi) { struct free_segmap_info *free_i; unsigned int bitmap_size, sec_bitmap_size; /* allocate memory for free segmap information */ free_i = f2fs_kzalloc(sbi, sizeof(struct free_segmap_info), GFP_KERNEL); if (!free_i) return -ENOMEM; SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); free_i->free_segmap = f2fs_kvmalloc(sbi, bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); free_i->free_secmap = f2fs_kvmalloc(sbi, sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; /* set all segments as dirty temporarily */ memset(free_i->free_segmap, 0xff, bitmap_size); memset(free_i->free_secmap, 0xff, sec_bitmap_size); /* init free segmap information */ free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi)); free_i->free_segments = 0; free_i->free_sections = 0; spin_lock_init(&free_i->segmap_lock); return 0; } static int build_curseg(struct f2fs_sb_info *sbi) { struct curseg_info *array; int i; array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, sizeof(*array)), GFP_KERNEL); if (!array) return -ENOMEM; SM_I(sbi)->curseg_array = array; for (i = 0; i < NO_CHECK_TYPE; i++) { mutex_init(&array[i].curseg_mutex); array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; init_rwsem(&array[i].journal_rwsem); array[i].journal = f2fs_kzalloc(sbi, sizeof(struct f2fs_journal), GFP_KERNEL); if (!array[i].journal) return -ENOMEM; array[i].seg_type = log_type_to_seg_type(i); reset_curseg_fields(&array[i]); } return restore_curseg_summaries(sbi); } static int build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_journal *journal = curseg->journal; struct seg_entry *se; struct f2fs_sit_entry sit; int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; int err = 0; block_t sit_valid_blocks[2] = {0, 0}; do { readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; for (; start < end && start < MAIN_SEGS(sbi); start++) { struct f2fs_sit_block *sit_blk; struct folio *folio; se = &sit_i->sentries[start]; folio = get_current_sit_folio(sbi, start); if (IS_ERR(folio)) return PTR_ERR(folio); sit_blk = folio_address(folio); sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_folio_put(folio, true); err = check_block_count(sbi, start, &sit); if (err) return err; seg_info_from_raw_sit(se, &sit); if (se->type >= NR_PERSISTENT_LOG) { f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); return -EFSCORRUPTED; } sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (!f2fs_block_unit_discard(sbi)) goto init_discard_map_done; /* build discard map only one time */ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); goto init_discard_map_done; } memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); sbi->discard_blks += BLKS_PER_SEG(sbi) - se->valid_blocks; init_discard_map_done: if (__is_large_section(sbi)) get_sec_entry(sbi, start)->valid_blocks += se->valid_blocks; } start_blk += readed; } while (start_blk < sit_blk_cnt); down_read(&curseg->journal_rwsem); for (i = 0; i < sits_in_cursum(journal); i++) { unsigned int old_valid_blocks; start = le32_to_cpu(segno_in_journal(journal, i)); if (start >= MAIN_SEGS(sbi)) { f2fs_err(sbi, "Wrong journal entry on segno %u", start); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL); break; } se = &sit_i->sentries[start]; sit = sit_in_journal(journal, i); old_valid_blocks = se->valid_blocks; sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks; err = check_block_count(sbi, start, &sit); if (err) break; seg_info_from_raw_sit(se, &sit); if (se->type >= NR_PERSISTENT_LOG) { f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); break; } sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); } else { memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); sbi->discard_blks += old_valid_blocks; sbi->discard_blks -= se->valid_blocks; } } if (__is_large_section(sbi)) { get_sec_entry(sbi, start)->valid_blocks += se->valid_blocks; get_sec_entry(sbi, start)->valid_blocks -= old_valid_blocks; } } up_read(&curseg->journal_rwsem); /* update ckpt_valid_block */ if (__is_large_section(sbi)) { unsigned int segno; for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { set_ckpt_valid_blocks(sbi, segno); sanity_check_valid_blocks(sbi, segno); } } if (err) return err; if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", sit_valid_blocks[NODE], valid_node_count(sbi)); f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT); return -EFSCORRUPTED; } if (sit_valid_blocks[DATA] + sit_valid_blocks[NODE] > valid_user_blocks(sbi)) { f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", sit_valid_blocks[DATA], sit_valid_blocks[NODE], valid_user_blocks(sbi)); f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT); return -EFSCORRUPTED; } return 0; } static void init_free_segmap(struct f2fs_sb_info *sbi) { unsigned int start; int type; struct seg_entry *sentry; for (start = 0; start < MAIN_SEGS(sbi); start++) { if (f2fs_usable_blks_in_seg(sbi, start) == 0) continue; sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); else SIT_I(sbi)->written_valid_blocks += sentry->valid_blocks; } /* set use the current segments */ for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) { struct curseg_info *curseg_t = CURSEG_I(sbi, type); __set_test_and_inuse(sbi, curseg_t->segno); } } static void init_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno = 0, offset = 0, secno; block_t valid_blocks, usable_blks_in_seg; while (1) { /* find dirty segment based on free segmap */ segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset); if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; valid_blocks = get_valid_blocks(sbi, segno, false); usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); if (valid_blocks == usable_blks_in_seg || !valid_blocks) continue; if (valid_blocks > usable_blks_in_seg) { f2fs_bug_on(sbi, 1); continue; } mutex_lock(&dirty_i->seglist_lock); __locate_dirty_segment(sbi, segno, DIRTY); mutex_unlock(&dirty_i->seglist_lock); } if (!__is_large_section(sbi)) return; mutex_lock(&dirty_i->seglist_lock); for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { valid_blocks = get_valid_blocks(sbi, segno, true); secno = GET_SEC_FROM_SEG(sbi, segno); if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) continue; if (is_cursec(sbi, secno)) continue; set_bit(secno, dirty_i->dirty_secmap); } mutex_unlock(&dirty_i->seglist_lock); } static int init_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->pinned_secmap) return -ENOMEM; dirty_i->pinned_secmap_cnt = 0; dirty_i->enable_pin_section = true; return 0; } static int build_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i; unsigned int bitmap_size, i; /* allocate memory for dirty segments list information */ dirty_i = f2fs_kzalloc(sbi, sizeof(struct dirty_seglist_info), GFP_KERNEL); if (!dirty_i) return -ENOMEM; SM_I(sbi)->dirty_info = dirty_i; mutex_init(&dirty_i->seglist_lock); bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { dirty_i->dirty_segmap[i] = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } if (__is_large_section(sbi)) { bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); dirty_i->dirty_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_secmap) return -ENOMEM; } init_dirty_segmap(sbi); return init_victim_secmap(sbi); } static int sanity_check_curseg(struct f2fs_sb_info *sbi) { int i; /* * In LFS/SSR curseg, .next_blkoff should point to an unused blkaddr; * In LFS curseg, all blkaddr after .next_blkoff should be unused. */ for (i = 0; i < NR_PERSISTENT_LOG; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); struct seg_entry *se = get_seg_entry(sbi, curseg->segno); unsigned int blkofs = curseg->next_blkoff; if (f2fs_sb_has_readonly(sbi) && i != CURSEG_HOT_DATA && i != CURSEG_HOT_NODE) continue; sanity_check_seg_type(sbi, curseg->seg_type); if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) { f2fs_err(sbi, "Current segment has invalid alloc_type:%d", curseg->alloc_type); f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } if (f2fs_test_bit(blkofs, se->cur_valid_map)) goto out; if (curseg->alloc_type == SSR) continue; for (blkofs += 1; blkofs < BLKS_PER_SEG(sbi); blkofs++) { if (!f2fs_test_bit(blkofs, se->cur_valid_map)) continue; out: f2fs_err(sbi, "Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u", i, curseg->segno, curseg->alloc_type, curseg->next_blkoff, blkofs); f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } } return 0; } #ifdef CONFIG_BLK_DEV_ZONED static int check_zone_write_pointer(struct f2fs_sb_info *sbi, struct f2fs_dev_info *fdev, struct blk_zone *zone) { unsigned int zone_segno; block_t zone_block, valid_block_cnt; unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; int ret; unsigned int nofs_flags; if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block); zone_segno = GET_SEGNO(sbi, zone_block); /* * Skip check of zones cursegs point to, since * fix_curseg_write_pointer() checks them. */ if (zone_segno >= MAIN_SEGS(sbi)) return 0; /* * Get # of valid block of the zone. */ valid_block_cnt = get_valid_blocks(sbi, zone_segno, true); if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) { f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]", zone_segno, valid_block_cnt, blk_zone_cond_str(zone->cond)); return 0; } if ((!valid_block_cnt && zone->cond == BLK_ZONE_COND_EMPTY) || (valid_block_cnt && zone->cond == BLK_ZONE_COND_FULL)) return 0; if (!valid_block_cnt) { f2fs_notice(sbi, "Zone without valid block has non-zero write " "pointer. Reset the write pointer: cond[%s]", blk_zone_cond_str(zone->cond)); ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, zone->len >> log_sectors_per_block); if (ret) f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", fdev->path, ret); return ret; } /* * If there are valid blocks and the write pointer doesn't match * with them, we need to report the inconsistency and fill * the zone till the end to close the zone. This inconsistency * does not cause write error because the zone will not be * selected for write operation until it get discarded. */ f2fs_notice(sbi, "Valid blocks are not aligned with write " "pointer: valid block[0x%x,0x%x] cond[%s]", zone_segno, valid_block_cnt, blk_zone_cond_str(zone->cond)); nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, zone->start, zone->len); memalloc_nofs_restore(nofs_flags); if (ret == -EOPNOTSUPP) { ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, zone->len - (zone->wp - zone->start), GFP_NOFS, 0); if (ret) f2fs_err(sbi, "Fill up zone failed: %s (errno=%d)", fdev->path, ret); } else if (ret) { f2fs_err(sbi, "Finishing zone failed: %s (errno=%d)", fdev->path, ret); } return ret; } static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi, block_t zone_blkaddr) { int i; for (i = 0; i < sbi->s_ndevs; i++) { if (!bdev_is_zoned(FDEV(i).bdev)) continue; if (sbi->s_ndevs == 1 || (FDEV(i).start_blk <= zone_blkaddr && zone_blkaddr <= FDEV(i).end_blk)) return &FDEV(i); } return NULL; } static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, void *data) { memcpy(data, zone, sizeof(struct blk_zone)); return 0; } static int do_fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) { struct curseg_info *cs = CURSEG_I(sbi, type); struct f2fs_dev_info *zbd; struct blk_zone zone; unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off; block_t cs_zone_block, wp_block; unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; sector_t zone_sector; int err; cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); zbd = get_target_zoned_dev(sbi, cs_zone_block); if (!zbd) return 0; /* report zone for the sector the curseg points to */ zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) << log_sectors_per_block; err = blkdev_report_zones(zbd->bdev, zone_sector, 1, report_one_zone_cb, &zone); if (err != 1) { f2fs_err(sbi, "Report zone failed: %s errno=(%d)", zbd->path, err); return err; } if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; /* * When safely unmounted in the previous mount, we could use current * segments. Otherwise, allocate new sections. */ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block); wp_segno = GET_SEGNO(sbi, wp_block); wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0); if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && wp_sector_off == 0) return 0; f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: " "curseg[0x%x,0x%x] wp[0x%x,0x%x]", type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff); } /* Allocate a new section if it's not new. */ if (cs->next_blkoff || cs->segno != GET_SEG_FROM_SEC(sbi, GET_ZONE_FROM_SEC(sbi, cs_section))) { unsigned int old_segno = cs->segno, old_blkoff = cs->next_blkoff; f2fs_allocate_new_section(sbi, type, true); f2fs_notice(sbi, "Assign new section to curseg[%d]: " "[0x%x,0x%x] -> [0x%x,0x%x]", type, old_segno, old_blkoff, cs->segno, cs->next_blkoff); } /* check consistency of the zone curseg pointed to */ if (check_zone_write_pointer(sbi, zbd, &zone)) return -EIO; /* check newly assigned zone */ cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); zbd = get_target_zoned_dev(sbi, cs_zone_block); if (!zbd) return 0; zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) << log_sectors_per_block; err = blkdev_report_zones(zbd->bdev, zone_sector, 1, report_one_zone_cb, &zone); if (err != 1) { f2fs_err(sbi, "Report zone failed: %s errno=(%d)", zbd->path, err); return err; } if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; if (zone.wp != zone.start) { f2fs_notice(sbi, "New zone for curseg[%d] is not yet discarded. " "Reset the zone: curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); err = __f2fs_issue_discard_zone(sbi, zbd->bdev, cs_zone_block, zone.len >> log_sectors_per_block); if (err) { f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", zbd->path, err); return err; } } return 0; } static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; for (i = 0; i < NR_PERSISTENT_LOG; i++) { ret = do_fix_curseg_write_pointer(sbi, i); if (ret) return ret; } return 0; } struct check_zone_write_pointer_args { struct f2fs_sb_info *sbi; struct f2fs_dev_info *fdev; }; static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct check_zone_write_pointer_args *args; args = (struct check_zone_write_pointer_args *)data; return check_zone_write_pointer(args->sbi, args->fdev, zone); } static int check_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; struct check_zone_write_pointer_args args; for (i = 0; i < sbi->s_ndevs; i++) { if (!bdev_is_zoned(FDEV(i).bdev)) continue; args.sbi = sbi; args.fdev = &FDEV(i); ret = blkdev_report_zones(FDEV(i).bdev, 0, BLK_ALL_ZONES, check_zone_write_pointer_cb, &args); if (ret < 0) return ret; } return 0; } int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) { int ret; if (!f2fs_sb_has_blkzoned(sbi) || f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi)) return 0; f2fs_notice(sbi, "Checking entire write pointers"); ret = fix_curseg_write_pointer(sbi); if (!ret) ret = check_write_pointer(sbi); return ret; } /* * Return the number of usable blocks in a segment. The number of blocks * returned is always equal to the number of blocks in a segment for * segments fully contained within a sequential zone capacity or a * conventional zone. For segments partially contained in a sequential * zone capacity, the number of usable blocks up to the zone capacity * is returned. 0 is returned in all other cases. */ static inline unsigned int f2fs_usable_zone_blks_in_seg( struct f2fs_sb_info *sbi, unsigned int segno) { block_t seg_start, sec_start_blkaddr, sec_cap_blkaddr; unsigned int secno; if (!sbi->unusable_blocks_per_sec) return BLKS_PER_SEG(sbi); secno = GET_SEC_FROM_SEG(sbi, segno); seg_start = START_BLOCK(sbi, segno); sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); sec_cap_blkaddr = sec_start_blkaddr + CAP_BLKS_PER_SEC(sbi); /* * If segment starts before zone capacity and spans beyond * zone capacity, then usable blocks are from seg start to * zone capacity. If the segment starts after the zone capacity, * then there are no usable blocks. */ if (seg_start >= sec_cap_blkaddr) return 0; if (seg_start + BLKS_PER_SEG(sbi) > sec_cap_blkaddr) return sec_cap_blkaddr - seg_start; return BLKS_PER_SEG(sbi); } #else int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) { return 0; } static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno) { return 0; } #endif unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno) { if (f2fs_sb_has_blkzoned(sbi)) return f2fs_usable_zone_blks_in_seg(sbi, segno); return BLKS_PER_SEG(sbi); } unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi) { if (f2fs_sb_has_blkzoned(sbi)) return CAP_SEGS_PER_SEC(sbi); return SEGS_PER_SEC(sbi); } unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, unsigned int segno) { unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi); unsigned int secno = 0, start = 0; unsigned int total_valid_blocks = 0; unsigned long long mtime = 0; unsigned int i = 0; secno = GET_SEC_FROM_SEG(sbi, segno); start = GET_SEG_FROM_SEC(sbi, secno); if (!__is_large_section(sbi)) { mtime = get_seg_entry(sbi, start + i)->mtime; goto out; } for (i = 0; i < usable_segs_per_sec; i++) { /* for large section, only check the mtime of valid segments */ struct seg_entry *se = get_seg_entry(sbi, start+i); mtime += se->mtime * se->valid_blocks; total_valid_blocks += se->valid_blocks; } if (total_valid_blocks == 0) return INVALID_MTIME; mtime = div_u64(mtime, total_valid_blocks); out: if (unlikely(mtime == INVALID_MTIME)) mtime -= 1; return mtime; } /* * Update min, max modified time for cost-benefit GC algorithm */ static void init_min_max_mtime(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); unsigned int segno; down_write(&sit_i->sentry_lock); sit_i->min_mtime = ULLONG_MAX; for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { unsigned long long mtime = 0; mtime = f2fs_get_section_mtime(sbi, segno); if (sit_i->min_mtime > mtime) sit_i->min_mtime = mtime; } sit_i->max_mtime = get_mtime(sbi, false); sit_i->dirty_max_mtime = 0; up_write(&sit_i->sentry_lock); } int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_sm_info *sm_info; int err; sm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_sm_info), GFP_KERNEL); if (!sm_info) return -ENOMEM; /* init sm info */ sbi->sm_info = sm_info; sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); sm_info->segment_count = le32_to_cpu(raw_super->segment_count); sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = sm_info->main_segments * DEF_RECLAIM_PREFREE_SEGMENTS / 100; if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; if (!f2fs_lfs_mode(sbi)) sm_info->ipu_policy = BIT(F2FS_IPU_FSYNC); sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; sm_info->min_seq_blocks = BLKS_PER_SEG(sbi); sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); INIT_LIST_HEAD(&sm_info->sit_entry_set); init_f2fs_rwsem(&sm_info->curseg_lock); err = f2fs_create_flush_cmd_control(sbi); if (err) return err; err = create_discard_cmd_control(sbi); if (err) return err; err = build_sit_info(sbi); if (err) return err; err = build_free_segmap(sbi); if (err) return err; err = build_curseg(sbi); if (err) return err; /* reinit free segmap based on SIT */ err = build_sit_entries(sbi); if (err) return err; init_free_segmap(sbi); err = build_dirty_segmap(sbi); if (err) return err; err = sanity_check_curseg(sbi); if (err) return err; init_min_max_mtime(sbi); return 0; } static void discard_dirty_segmap(struct f2fs_sb_info *sbi, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); mutex_lock(&dirty_i->seglist_lock); kvfree(dirty_i->dirty_segmap[dirty_type]); dirty_i->nr_dirty[dirty_type] = 0; mutex_unlock(&dirty_i->seglist_lock); } static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); kvfree(dirty_i->pinned_secmap); kvfree(dirty_i->victim_secmap); } static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); int i; if (!dirty_i) return; /* discard pre-free/dirty segments list */ for (i = 0; i < NR_DIRTY_TYPE; i++) discard_dirty_segmap(sbi, i); if (__is_large_section(sbi)) { mutex_lock(&dirty_i->seglist_lock); kvfree(dirty_i->dirty_secmap); mutex_unlock(&dirty_i->seglist_lock); } destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; kfree(dirty_i); } static void destroy_curseg(struct f2fs_sb_info *sbi) { struct curseg_info *array = SM_I(sbi)->curseg_array; int i; if (!array) return; SM_I(sbi)->curseg_array = NULL; for (i = 0; i < NR_CURSEG_TYPE; i++) { kfree(array[i].sum_blk); kfree(array[i].journal); } kfree(array); } static void destroy_free_segmap(struct f2fs_sb_info *sbi) { struct free_segmap_info *free_i = SM_I(sbi)->free_info; if (!free_i) return; SM_I(sbi)->free_info = NULL; kvfree(free_i->free_segmap); kvfree(free_i->free_secmap); kfree(free_i); } static void destroy_sit_info(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); if (!sit_i) return; if (sit_i->sentries) kvfree(sit_i->bitmap); kfree(sit_i->tmp_map); kvfree(sit_i->sentries); kvfree(sit_i->sec_entries); kvfree(sit_i->dirty_sentries_bitmap); SM_I(sbi)->sit_info = NULL; kfree(sit_i->sit_bitmap); #ifdef CONFIG_F2FS_CHECK_FS kfree(sit_i->sit_bitmap_mir); kvfree(sit_i->invalid_segmap); #endif kfree(sit_i); } void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); if (!sm_info) return; f2fs_destroy_flush_cmd_control(sbi, true); destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); destroy_sit_info(sbi); sbi->sm_info = NULL; kfree(sm_info); } int __init f2fs_create_segment_manager_caches(void) { discard_entry_slab = f2fs_kmem_cache_create("f2fs_discard_entry", sizeof(struct discard_entry)); if (!discard_entry_slab) goto fail; discard_cmd_slab = f2fs_kmem_cache_create("f2fs_discard_cmd", sizeof(struct discard_cmd)); if (!discard_cmd_slab) goto destroy_discard_entry; sit_entry_set_slab = f2fs_kmem_cache_create("f2fs_sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) goto destroy_discard_cmd; revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry", sizeof(struct revoke_entry)); if (!revoke_entry_slab) goto destroy_sit_entry_set; return 0; destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); destroy_discard_cmd: kmem_cache_destroy(discard_cmd_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: return -ENOMEM; } void f2fs_destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(revoke_entry_slab); }
21 13 8 40 40 62 63 8 9 10 10 7 7 5 8 1 1 8 8 1 9 9 9 9 9 9 9 3 3 42 7 36 36 34 36 36 35 35 8 8 8 8 36 36 35 2 11 11 2 11 1 11 5 5 2 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 /* Linux multicast routing support * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation */ #include <linux/rhashtable.h> #include <linux/mroute_base.h> /* Sets everything common except 'dev', since that is done under locking */ void vif_device_init(struct vif_device *v, struct net_device *dev, unsigned long rate_limit, unsigned char threshold, unsigned short flags, unsigned short get_iflink_mask) { RCU_INIT_POINTER(v->dev, NULL); v->bytes_in = 0; v->bytes_out = 0; v->pkt_in = 0; v->pkt_out = 0; v->rate_limit = rate_limit; v->flags = flags; v->threshold = threshold; if (v->flags & get_iflink_mask) v->link = dev_get_iflink(dev); else v->link = dev->ifindex; } EXPORT_SYMBOL(vif_device_init); struct mr_table * mr_table_alloc(struct net *net, u32 id, struct mr_table_ops *ops, void (*expire_func)(struct timer_list *t), void (*table_set)(struct mr_table *mrt, struct net *net)) { struct mr_table *mrt; int err; mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); if (!mrt) return ERR_PTR(-ENOMEM); mrt->id = id; write_pnet(&mrt->net, net); mrt->ops = *ops; err = rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params); if (err) { kfree(mrt); return ERR_PTR(err); } INIT_LIST_HEAD(&mrt->mfc_cache_list); INIT_LIST_HEAD(&mrt->mfc_unres_queue); timer_setup(&mrt->ipmr_expire_timer, expire_func, 0); mrt->mroute_reg_vif_num = -1; table_set(mrt, net); return mrt; } EXPORT_SYMBOL(mr_table_alloc); void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent) { struct rhlist_head *tmp, *list; struct mr_mfc *c; list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) if (parent == -1 || parent == c->mfc_parent) return c; return NULL; } EXPORT_SYMBOL(mr_mfc_find_parent); void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi) { struct rhlist_head *tmp, *list; struct mr_mfc *c; list = rhltable_lookup(&mrt->mfc_hash, mrt->ops.cmparg_any, *mrt->ops.rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) if (c->mfc_un.res.ttls[vifi] < 255) return c; return NULL; } EXPORT_SYMBOL(mr_mfc_find_any_parent); void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg) { struct rhlist_head *tmp, *list; struct mr_mfc *c, *proxy; list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) { if (c->mfc_un.res.ttls[vifi] < 255) return c; /* It's ok if the vifi is part of the static tree */ proxy = mr_mfc_find_any_parent(mrt, c->mfc_parent); if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) return c; } return mr_mfc_find_any_parent(mrt, vifi); } EXPORT_SYMBOL(mr_mfc_find_any); #ifdef CONFIG_PROC_FS void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos) { struct mr_table *mrt = iter->mrt; for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { if (!VIF_EXISTS(mrt, iter->ct)) continue; if (pos-- == 0) return &mrt->vif_table[iter->ct]; } return NULL; } EXPORT_SYMBOL(mr_vif_seq_idx); void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt = iter->mrt; ++*pos; if (v == SEQ_START_TOKEN) return mr_vif_seq_idx(net, iter, 0); while (++iter->ct < mrt->maxvif) { if (!VIF_EXISTS(mrt, iter->ct)) continue; return &mrt->vif_table[iter->ct]; } return NULL; } EXPORT_SYMBOL(mr_vif_seq_next); void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos) { struct mr_table *mrt = it->mrt; struct mr_mfc *mfc; rcu_read_lock(); it->cache = &mrt->mfc_cache_list; list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) if (pos-- == 0) return mfc; rcu_read_unlock(); spin_lock_bh(it->lock); it->cache = &mrt->mfc_unres_queue; list_for_each_entry(mfc, it->cache, list) if (pos-- == 0) return mfc; spin_unlock_bh(it->lock); it->cache = NULL; return NULL; } EXPORT_SYMBOL(mr_mfc_seq_idx); void *mr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct mr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt = it->mrt; struct mr_mfc *c = v; ++*pos; if (v == SEQ_START_TOKEN) return mr_mfc_seq_idx(net, seq->private, 0); if (c->list.next != it->cache) return list_entry(c->list.next, struct mr_mfc, list); if (it->cache == &mrt->mfc_unres_queue) goto end_of_list; /* exhausted cache_array, show unresolved */ rcu_read_unlock(); it->cache = &mrt->mfc_unres_queue; spin_lock_bh(it->lock); if (!list_empty(it->cache)) return list_first_entry(it->cache, struct mr_mfc, list); end_of_list: spin_unlock_bh(it->lock); it->cache = NULL; return NULL; } EXPORT_SYMBOL(mr_mfc_seq_next); #endif int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mr_mfc *c, struct rtmsg *rtm) { struct net_device *vif_dev; struct rta_mfc_stats mfcs; struct nlattr *mp_attr; struct rtnexthop *nhp; unsigned long lastuse; int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ if (c->mfc_parent >= MAXVIFS) { rtm->rtm_flags |= RTNH_F_UNRESOLVED; return -ENOENT; } rcu_read_lock(); vif_dev = rcu_dereference(mrt->vif_table[c->mfc_parent].dev); if (vif_dev && nla_put_u32(skb, RTA_IIF, vif_dev->ifindex) < 0) { rcu_read_unlock(); return -EMSGSIZE; } rcu_read_unlock(); if (c->mfc_flags & MFC_OFFLOAD) rtm->rtm_flags |= RTNH_F_OFFLOAD; mp_attr = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp_attr) return -EMSGSIZE; rcu_read_lock(); for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { struct vif_device *vif = &mrt->vif_table[ct]; vif_dev = rcu_dereference(vif->dev); if (vif_dev && c->mfc_un.res.ttls[ct] < 255) { nhp = nla_reserve_nohdr(skb, sizeof(*nhp)); if (!nhp) { rcu_read_unlock(); nla_nest_cancel(skb, mp_attr); return -EMSGSIZE; } nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; nhp->rtnh_ifindex = vif_dev->ifindex; nhp->rtnh_len = sizeof(*nhp); } } rcu_read_unlock(); nla_nest_end(skb, mp_attr); lastuse = READ_ONCE(c->mfc_un.res.lastuse); lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; mfcs.mfcs_packets = atomic_long_read(&c->mfc_un.res.pkt); mfcs.mfcs_bytes = atomic_long_read(&c->mfc_un.res.bytes); mfcs.mfcs_wrong_if = atomic_long_read(&c->mfc_un.res.wrong_if); if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), RTA_PAD)) return -EMSGSIZE; rtm->rtm_type = RTN_MULTICAST; return 1; } EXPORT_SYMBOL(mr_fill_mroute); static bool mr_mfc_uses_dev(const struct mr_table *mrt, const struct mr_mfc *c, const struct net_device *dev) { int ct; for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { const struct net_device *vif_dev; const struct vif_device *vif; vif = &mrt->vif_table[ct]; vif_dev = rcu_access_pointer(vif->dev); if (vif_dev && c->mfc_un.res.ttls[ct] < 255 && vif_dev == dev) return true; } return false; } int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb, struct netlink_callback *cb, int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter) { unsigned int e = 0, s_e = cb->args[1]; unsigned int flags = NLM_F_MULTI; struct mr_mfc *mfc; int err; if (filter->filter_set) flags |= NLM_F_DUMP_FILTERED; list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list, lockdep_rtnl_is_held()) { if (e < s_e) goto next_entry; if (filter->dev && !mr_mfc_uses_dev(mrt, mfc, filter->dev)) goto next_entry; err = fill(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags); if (err < 0) goto out; next_entry: e++; } spin_lock_bh(lock); list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { if (e < s_e) goto next_entry2; err = fill(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags); if (err < 0) { spin_unlock_bh(lock); goto out; } next_entry2: e++; } spin_unlock_bh(lock); err = 0; out: cb->args[1] = e; return err; } EXPORT_SYMBOL(mr_table_dump); int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, struct mr_table *(*iter)(struct net *net, struct mr_table *mrt), int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter) { unsigned int t = 0, s_t = cb->args[0]; struct net *net = sock_net(skb->sk); struct mr_table *mrt; int err; /* multicast does not track protocol or have route type other * than RTN_MULTICAST */ if (filter->filter_set) { if (filter->protocol || filter->flags || (filter->rt_type && filter->rt_type != RTN_MULTICAST)) return skb->len; } rcu_read_lock(); for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) { if (t < s_t) goto next_table; err = mr_table_dump(mrt, skb, cb, fill, lock, filter); if (err < 0) break; cb->args[1] = 0; next_table: t++; } rcu_read_unlock(); cb->args[0] = t; return skb->len; } EXPORT_SYMBOL(mr_rtm_dumproute); int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), struct netlink_ext_ack *extack) { struct mr_table *mrt; int err; err = rules_dump(net, nb, extack); if (err) return err; for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) { struct vif_device *v = &mrt->vif_table[0]; struct net_device *vif_dev; struct mr_mfc *mfc; int vifi; /* Notifiy on table VIF entries */ rcu_read_lock(); for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { vif_dev = rcu_dereference(v->dev); if (!vif_dev) continue; err = mr_call_vif_notifier(nb, family, FIB_EVENT_VIF_ADD, v, vif_dev, vifi, mrt->id, extack); if (err) break; } rcu_read_unlock(); if (err) return err; /* Notify on table MFC entries */ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { err = mr_call_mfc_notifier(nb, family, FIB_EVENT_ENTRY_ADD, mfc, mrt->id, extack); if (err) return err; } } return 0; } EXPORT_SYMBOL(mr_dump);
1 5 5 5 5 2 3 4 5 1 4 5 5 4 1 3 3 3 3 1 2 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 // SPDX-License-Identifier: GPL-2.0-or-later /* * USB Wacom tablet support - Wacom specific code */ #include "wacom_wac.h" #include "wacom.h" #include <linux/input/mt.h> #include <linux/jiffies.h> /* resolution for penabled devices */ #define WACOM_PL_RES 20 #define WACOM_PENPRTN_RES 40 #define WACOM_VOLITO_RES 50 #define WACOM_GRAPHIRE_RES 80 #define WACOM_INTUOS_RES 100 #define WACOM_INTUOS3_RES 200 /* Newer Cintiq and DTU have an offset between tablet and screen areas */ #define WACOM_DTU_OFFSET 200 #define WACOM_CINTIQ_OFFSET 400 /* * Scale factor relating reported contact size to logical contact area. * 2^14/pi is a good approximation on Intuos5 and 3rd-gen Bamboo */ #define WACOM_CONTACT_AREA_SCALE 2607 static bool touch_arbitration = 1; module_param(touch_arbitration, bool, 0644); MODULE_PARM_DESC(touch_arbitration, " on (Y) off (N)"); static void wacom_report_numbered_buttons(struct input_dev *input_dev, int button_count, int mask); static int wacom_numbered_button_to_key(int n); static void wacom_update_led(struct wacom *wacom, int button_count, int mask, int group); static void wacom_force_proxout(struct wacom_wac *wacom_wac) { struct input_dev *input = wacom_wac->pen_input; wacom_wac->shared->stylus_in_proximity = 0; input_report_key(input, BTN_TOUCH, 0); input_report_key(input, BTN_STYLUS, 0); input_report_key(input, BTN_STYLUS2, 0); input_report_key(input, BTN_STYLUS3, 0); input_report_key(input, wacom_wac->tool[0], 0); if (wacom_wac->serial[0]) { input_report_abs(input, ABS_MISC, 0); } input_report_abs(input, ABS_PRESSURE, 0); wacom_wac->tool[0] = 0; wacom_wac->id[0] = 0; wacom_wac->serial[0] = 0; input_sync(input); } void wacom_idleprox_timeout(struct timer_list *list) { struct wacom *wacom = timer_container_of(wacom, list, idleprox_timer); struct wacom_wac *wacom_wac = &wacom->wacom_wac; if (!wacom_wac->hid_data.sense_state) { return; } hid_warn(wacom->hdev, "%s: tool appears to be hung in-prox. forcing it out.\n", __func__); wacom_force_proxout(wacom_wac); } /* * Percent of battery capacity for Graphire. * 8th value means AC online and show 100% capacity. */ static unsigned short batcap_gr[8] = { 1, 15, 25, 35, 50, 70, 100, 100 }; /* * Percent of battery capacity for Intuos4 WL, AC has a separate bit. */ static unsigned short batcap_i4[8] = { 1, 15, 30, 45, 60, 70, 85, 100 }; static void __wacom_notify_battery(struct wacom_battery *battery, int bat_status, int bat_capacity, bool bat_charging, bool bat_connected, bool ps_connected) { bool changed = battery->bat_status != bat_status || battery->battery_capacity != bat_capacity || battery->bat_charging != bat_charging || battery->bat_connected != bat_connected || battery->ps_connected != ps_connected; if (changed) { battery->bat_status = bat_status; battery->battery_capacity = bat_capacity; battery->bat_charging = bat_charging; battery->bat_connected = bat_connected; battery->ps_connected = ps_connected; if (battery->battery) power_supply_changed(battery->battery); } } static void wacom_notify_battery(struct wacom_wac *wacom_wac, int bat_status, int bat_capacity, bool bat_charging, bool bat_connected, bool ps_connected) { struct wacom *wacom = container_of(wacom_wac, struct wacom, wacom_wac); bool bat_initialized = wacom->battery.battery; bool has_quirk = wacom_wac->features.quirks & WACOM_QUIRK_BATTERY; if (bat_initialized != has_quirk) wacom_schedule_work(wacom_wac, WACOM_WORKER_BATTERY); __wacom_notify_battery(&wacom->battery, bat_status, bat_capacity, bat_charging, bat_connected, ps_connected); } static int wacom_penpartner_irq(struct wacom_wac *wacom) { unsigned char *data = wacom->data; struct input_dev *input = wacom->pen_input; switch (data[0]) { case 1: if (data[5] & 0x80) { wacom->tool[0] = (data[5] & 0x20) ? BTN_TOOL_RUBBER : BTN_TOOL_PEN; wacom->id[0] = (data[5] & 0x20) ? ERASER_DEVICE_ID : STYLUS_DEVICE_ID; input_report_key(input, wacom->tool[0], 1); input_report_abs(input, ABS_MISC, wacom->id[0]); /* report tool id */ input_report_abs(input, ABS_X, get_unaligned_le16(&data[1])); input_report_abs(input, ABS_Y, get_unaligned_le16(&data[3])); input_report_abs(input, ABS_PRESSURE, (signed char)data[6] + 127); input_report_key(input, BTN_TOUCH, ((signed char)data[6] > -127)); input_report_key(input, BTN_STYLUS, (data[5] & 0x40)); } else { input_report_key(input, wacom->tool[0], 0); input_report_abs(input, ABS_MISC, 0); /* report tool id */ input_report_abs(input, ABS_PRESSURE, -1); input_report_key(input, BTN_TOUCH, 0); } break; case 2: input_report_key(input, BTN_TOOL_PEN, 1); input_report_abs(input, ABS_MISC, STYLUS_DEVICE_ID); /* report tool id */ input_report_abs(input, ABS_X, get_unaligned_le16(&data[1])); input_report_abs(input, ABS_Y, get_unaligned_le16(&data[3])); input_report_abs(input, ABS_PRESSURE, (signed char)data[6] + 127); input_report_key(input, BTN_TOUCH, ((signed char)data[6] > -80) && !(data[5] & 0x20)); input_report_key(input, BTN_STYLUS, (data[5] & 0x40)); break; default: dev_dbg(input->dev.parent, "%s: received unknown report #%d\n", __func__, data[0]); return 0; } return 1; } static int wacom_pl_irq(struct wacom_wac *wacom) { struct wacom_features *features = &wacom->features; unsigned char *data = wacom->data; struct input_dev *input = wacom->pen_input; int prox, pressure; if (data[0] != WACOM_REPORT_PENABLED) { dev_dbg(input->dev.parent, "%s: received unknown report #%d\n", __func__, data[0]); return 0; } prox = data[1] & 0x40; if (!wacom->id[0]) { if ((data[0] & 0x10) || (data[4] & 0x20)) { wacom->tool[0] = BTN_TOOL_RUBBER; wacom->id[0] = ERASER_DEVICE_ID; } else { wacom->tool[0] = BTN_TOOL_PEN; wacom->id[0] = STYLUS_DEVICE_ID; } } /* If the eraser is in prox, STYLUS2 is always set. If we * mis-detected the type and notice that STYLUS2 isn't set * then force the eraser out of prox and let the pen in. */ if (wacom->tool[0] == BTN_TOOL_RUBBER && !(data[4] & 0x20)) { input_report_key(input, BTN_TOOL_RUBBER, 0); input_report_abs(input, ABS_MISC, 0); input_sync(input); wacom->tool[0] = BTN_TOOL_PEN; wacom->id[0] = STYLUS_DEVICE_ID; } if (prox) { pressure = (signed char)((data[7] << 1) | ((data[4] >> 2) & 1)); if (features->pressure_max > 255) pressure = (pressure << 1) | ((data[4] >> 6) & 1); pressure += (features->pressure_max + 1) / 2; input_report_abs(input, ABS_X, data[3] | (data[2] << 7) | ((data[1] & 0x03) << 14)); input_report_abs(input, ABS_Y, data[6] | (data[5] << 7) | ((data[4] & 0x03) << 14)); input_report_abs(input, ABS_PRESSURE, pressure); input_report_key(input, BTN_TOUCH, data[4] & 0x08); input_report_key(input, BTN_STYLUS, data[4] & 0x10); /* Only allow the stylus2 button to be reported for the pen tool. */ input_report_key(input, BTN_STYLUS2, (wacom->tool[0] == BTN_TOOL_PEN) && (data[4] & 0x20)); } if (!prox) wacom->id[0] = 0; input_report_key(input, wacom->tool[0], prox); input_report_abs(input, ABS_MISC, wacom->id[0]); return 1; } static int wacom_ptu_irq(struct wacom_wac *wacom) { unsigned char *data = wacom->data; struct input_dev *input = wacom->pen_input; if (data[0] != WACOM_REPORT_PENABLED) { dev_dbg(input->dev.parent, "%s: received unknown report #%d\n", __func__, data[0]); return 0; } if (data[1] & 0x04) { input_report_key(input, BTN_TOOL_RUBBER, data[1] & 0x20); input_report_key(input, BTN_TOUCH, data[1] & 0x08); wacom->id[0] = ERASER_DEVICE_ID; } else { input_report_key(input, BTN_TOOL_PEN, data[1] & 0x20); input_report_key(input, BTN_TOUCH, data[1] & 0x01); wacom->id[0] = STYLUS_DEVICE_ID; } input_report_abs(input, ABS_MISC, wacom->id[0]); /* report tool id */ input_report_abs(input, ABS_X, le16_to_cpup((__le16 *)&data[2])); input_report_abs(input, ABS_Y, le16_to_cpup((__le16 *)&data[4])); input_report_abs(input, ABS_PRESSURE, le16_to_cpup((__le16 *)&data[6])); input_report_key(input, BTN_STYLUS, data[1] & 0x02); input_report_key(input, BTN_STYLUS2, data[1] & 0x10); return 1; } static int wacom_dtu_irq(struct wacom_wac *wacom) { unsigned char *data = wacom->data; struct input_dev *input = wacom->pen_input; int prox = data[1] & 0x20; dev_dbg(input->dev.parent, "%s: received report #%d", __func__, data[0]); if (prox) { /* Going into proximity select tool */ wacom->tool[0] = (data[1] & 0x0c) ? BTN_TOOL_RUBBER : BTN_TOOL_PEN; if (wacom->tool[0] == BTN_TOOL_PEN) wacom->id[0] = STYLUS_DEVICE_ID; else wacom->id[0] = ERASER_DEVICE_ID; } input_report_key(input, BTN_STYLUS, data[1] & 0x02); input_report_key(input, BTN_STYLUS2, data[1] & 0x10); input_report_abs(input, ABS_X, le16_to_cpup((__le16 *)&data[2])); input_report_abs(input, ABS_Y, le16_to_cpup((__le16 *)&data[4])); input_report_abs(input, ABS_PRESSURE, ((data[7] & 0x01) << 8) | data[6]); input_report_key(input, BTN_TOUCH, data[1] & 0x05); if (!prox) /* out-prox */ wacom->id[0] = 0; input_report_key(input, wacom->tool[0], prox); input_report_abs(input, ABS_MISC, wacom->id[0]); return 1; } static int wacom_dtus_irq(struct wacom_wac *wacom) { unsigned char *data = wacom->data; struct input_dev *input = wacom->pen_input; unsigned short prox, pressure = 0; if (data[0] != WACOM_REPORT_DTUS && data[0] != WACOM_REPORT_DTUSPAD) { dev_dbg(input->dev.parent, "%s: received unknown report #%d", __func__, data[0]); return 0; } else if (data[0] == WACOM_REPORT_DTUSPAD) { input = wacom->pad_input; input_report_key(input, BTN_0, (data[1] & 0x01)); input_report_key(input, BTN_1, (data[1] & 0x02)); input_report_key(input, BTN_2, (data[1] & 0x04)); input_report_key(input, BTN_3, (data[1] & 0x08)); input_report_abs(input, ABS_MISC, data[1] & 0x0f ? PAD_DEVICE_ID : 0); return 1; } else { prox = data[1] & 0x80; if (prox) { switch ((data[1] >> 3) & 3) { case 1: /* Rubber */ wacom->tool[0] = BTN_TOOL_RUBBER; wacom->id[0] = ERASER_DEVICE_ID; break; case 2: /* Pen */ wacom->tool[0] = BTN_TOOL_PEN; wacom->id[0] = STYLUS_DEVICE_ID; break; } } input_report_key(input, BTN_STYLUS, data[1] & 0x20); input_report_key(input, BTN_STYLUS2, data[1] & 0x40); input_report_abs(input, ABS_X, get_unaligned_be16(&data[3])); input_report_abs(input, ABS_Y, get_unaligned_be16(&data[5])); pressure = ((data[1] & 0x03) << 8) | (data[2] & 0xff); input_report_abs(input, ABS_PRESSURE, pressure); input_report_key(input, BTN_TOUCH, pressure > 10); if (!prox) /* out-prox */ wacom->id[0] = 0; input_report_key(input, wacom->tool[0], prox); input_report_abs(input, ABS_MISC, wacom->id[0]); return 1; } } static int wacom_graphire_irq(struct wacom_wac *wacom) { struct wacom_features *features = &wacom->features; unsigned char *data = wacom->data; struct input_dev *input = wacom->pen_input; struct input_dev *pad_input = wacom->pad_input; int battery_capacity, ps_connected; int prox; int rw = 0; int retval = 0; if (features->type == GRAPHIRE_BT) { if (data[0] != WACOM_REPORT_PENABLED_BT) { dev_dbg(input->dev.parent, "%s: received unknown report #%d\n", __func__, data[0]); goto exit; } } else if (data[0] != WACOM_REPORT_PENABLED) { dev_dbg(input->dev.parent, "%s: received unknown report #%d\n", __func__, data[0]); goto exit; } prox = data[1] & 0x80; if (prox || wacom->id[0]) { if (prox) { switch ((data[1] >> 5) & 3) { case 0: /* Pen */ wacom->tool[0] = BTN_TOOL_PEN; wacom->id[0] = STYLUS_DEVICE_ID; break; case 1: /* Rubber */ wacom->tool[0] = BTN_TOOL_RUBBER; wacom->id[0] = ERASER_DEVICE_ID; break; case 2: /* Mouse with wheel */ input_report_key(input, BTN_MIDDLE, data[1] & 0x04); fallthrough; case 3: /* Mouse without wheel */ wacom->tool[0] = BTN_TOOL_MOUSE; wacom->id[0] = CURSOR_DEVICE_ID; break; } } input_report_abs(input, ABS_X, le16_to_cpup((__le16 *)&data[2])); input_report_abs(input, ABS_Y, le16_to_cpup((__le16 *)&data[4])); if (wacom->tool[0] != BTN_TOOL_MOUSE) { if (features->type == GRAPHIRE_BT) input_report_abs(input, ABS_PRESSURE, data[6] | (((__u16) (data[1] & 0x08)) << 5)); else input_report_abs(input, ABS_PRESSURE, data[6] | ((data[7] & 0x03) << 8)); input_report_key(input, BTN_TOUCH, data[1] & 0x01); input_report_key(input, BTN_STYLUS, data[1] & 0x02); input_report_key(input, BTN_STYLUS2, data[1] & 0x04); } else { input_report_key(input, BTN_LEFT, data[1] & 0x01); input_report_key(input, BTN_RIGHT, data[1] & 0x02); if (features->type == WACOM_G4 || features->type == WACOM_MO) { input_report_abs(input, ABS_DISTANCE, data[6] & 0x3f); rw = (data[7] & 0x04) - (data[7] & 0x03); } else if (features->type == GRAPHIRE_BT) { /* Compute distance between mouse and tablet */ rw = 44 - (data[6] >> 2); rw = clamp_val(rw, 0, 31); input_report_abs(input, ABS_DISTANCE, rw); if (((data[1] >> 5) & 3) == 2) { /* Mouse with wheel */ input_report_key(input, BTN_MIDDLE, data[1] & 0x04); rw = (data[6] & 0x01) ? -1 : (data[6] & 0x02) ? 1 : 0; } else { rw = 0; } } else { input_report_abs(input, ABS_DISTANCE, data[7] & 0x3f); rw = -(signed char)data[6]; } input_report_rel(input, REL_WHEEL, rw); } if (!prox) wacom->id[0] = 0; input_report_abs(input, ABS_MISC, wacom->id[0]); /* report tool id */ input_report_key(input, wacom->tool[0], prox); input_sync(input); /* sync last event */ } /* send pad data */ switch (features->type) { case WACOM_G4: prox = data[7] & 0xf8; if (prox || wacom->id[1]) { wacom->id[1] = PAD_DEVICE_ID; input_report_key(pad_input, BTN_BACK, (data[7] & 0x40)); input_report_key(pad_input, BTN_FORWARD, (data[7] & 0x80)); rw = ((data[7] & 0x18) >> 3) - ((data[7] & 0x20) >> 3); input_report_rel(pad_input, REL_WHEEL, rw); if (!prox) wacom->id[1] = 0; input_report_abs(pad_input, ABS_MISC, wacom->id[1]); retval = 1; } break; case WACOM_MO: prox = (data[7] & 0xf8) || data[8]; if (prox || wacom->id[1]) { wacom->id[1] = PAD_DEVICE_ID; input_report_key(pad_input, BTN_BACK, (data[7] & 0x08)); input_report_key(pad_input, BTN_LEFT, (data[7] & 0x20)); input_report_key(pad_input, BTN_FORWARD, (data[7] & 0x10)); input_report_key(pad_input, BTN_RIGHT, (data[7] & 0x40)); input_report_abs(pad_input, ABS_WHEEL, (data[8] & 0x7f)); if (!prox) wacom->id[1] = 0; input_report_abs(pad_input, ABS_MISC, wacom->id[1]); retval = 1; } break; case GRAPHIRE_BT: prox = data[7] & 0x03; if (prox || wacom->id[1]) { wacom->id[1] = PAD_DEVICE_ID; input_report_key(pad_input, BTN_0, (data[7] & 0x02)); input_report_key(pad_input, BTN_1, (data[7] & 0x01)); if (!prox) wacom->id[1] = 0; input_report_abs(pad_input, ABS_MISC, wacom->id[1]); retval = 1; } break; } /* Store current battery capacity and power supply state */ if (features->type == GRAPHIRE_BT) { rw = (data[7] >> 2 & 0x07); battery_capacity = batcap_gr[rw]; ps_connected = rw == 7; wacom_notify_battery(wacom, WACOM_POWER_SUPPLY_STATUS_AUTO, battery_capacity, ps_connected, 1, ps_connected); } exit: return retval; } static void wacom_intuos_schedule_prox_event(struct wacom_wac *wacom_wac) { struct wacom *wacom = container_of(wacom_wac, struct wacom, wacom_wac); struct wacom_features *features = &wacom_wac->features; struct hid_report *r; struct hid_report_enum *re; re = &(wacom->hdev->report_enum[HID_FEATURE_REPORT]); if (features->type == INTUOSHT2) r = re->report_id_hash[WACOM_REPORT_INTUOSHT2_ID]; else r = re->report_id_hash[WACOM_REPORT_INTUOS_ID1]; if (r) { hid_hw_request(wacom->hdev, r, HID_REQ_GET_REPORT); } } static int wacom_intuos_pad(struct wacom_wac *wacom) { struct wacom_features *features = &wacom->features; unsigned char *data = wacom->data; struct input_dev *input = wacom->pad_input; int i; int buttons = 0, nbuttons = features->numbered_buttons; int keys = 0, nkeys = 0; int ring1 = 0, ring2 = 0; int strip1 = 0, strip2 = 0; bool prox = false; bool wrench = false, keyboard = false, mute_touch = false, menu = false, info = false; /* pad packets. Works as a second tool and is always in prox */ if (!(data[0] == WACOM_REPORT_INTUOSPAD || data[0] == WACOM_REPORT_INTUOS5PAD || data[0] == WACOM_REPORT_CINTIQPAD)) return 0; if (features->type >= INTUOS4S && features->type <= INTUOS4L) { buttons = (data[3] << 1) | (data[2] & 0x01); ring1 = data[1]; } else if (features->type == DTK) { buttons = data[6]; } else if (features->type == WACOM_13HD) { buttons = (data[4] << 1) | (data[3] & 0x01); } else if (features->type == WACOM_24HD) { buttons = (data[8] << 8) | data[6]; ring1 = data[1]; ring2 = data[2]; /* * Three "buttons" are available on the 24HD which are * physically implemented as a touchstrip. Each button * is approximately 3 bits wide with a 2 bit spacing. * The raw touchstrip bits are stored at: * ((data[3] & 0x1f) << 8) | data[4]) */ nkeys = 3; keys = ((data[3] & 0x1C) ? 1<<2 : 0) | ((data[4] & 0xE0) ? 1<<1 : 0) | ((data[4] & 0x07) ? 1<<0 : 0); keyboard = !!(data[4] & 0xE0); info = !!(data[3] & 0x1C); if (features->oPid) { mute_touch = !!(data[4] & 0x07); if (mute_touch) wacom->shared->is_touch_on = !wacom->shared->is_touch_on; } else { wrench = !!(data[4] & 0x07); } } else if (features->type == WACOM_27QHD) { nkeys = 3; keys = data[2] & 0x07; wrench = !!(data[2] & 0x01); keyboard = !!(data[2] & 0x02); if (features->oPid) { mute_touch = !!(data[2] & 0x04); if (mute_touch) wacom->shared->is_touch_on = !wacom->shared->is_touch_on; } else { menu = !!(data[2] & 0x04); } input_report_abs(input, ABS_X, be16_to_cpup((__be16 *)&data[4])); input_report_abs(input, ABS_Y, be16_to_cpup((__be16 *)&data[6])); input_report_abs(input, ABS_Z, be16_to_cpup((__be16 *)&data[8])); } else if (features->type == CINTIQ_HYBRID) { /* * Do not send hardware buttons under Android. They * are already sent to the system through GPIO (and * have different meaning). * * d-pad right -> data[4] & 0x10 * d-pad up -> data[4] & 0x20 * d-pad left -> data[4] & 0x40 * d-pad down -> data[4] & 0x80 * d-pad center -> data[3] & 0x01 */ buttons = (data[4] << 1) | (data[3] & 0x01); } else if (features->type == CINTIQ_COMPANION_2) { /* d-pad right -> data[2] & 0x10 * d-pad up -> data[2] & 0x20 * d-pad left -> data[2] & 0x40 * d-pad down -> data[2] & 0x80 * d-pad center -> data[1] & 0x01 */ buttons = ((data[2] >> 4) << 7) | ((data[1] & 0x04) << 4) | ((data[2] & 0x0F) << 2) | (data[1] & 0x03); } else if (features->type >= INTUOS5S && features->type <= INTUOSPL) { /* * ExpressKeys on Intuos5/Intuos Pro have a capacitive sensor in * addition to the mechanical switch. Switch data is * stored in data[4], capacitive data in data[5]. * * Touch ring mode switch (data[3]) has no capacitive sensor */ buttons = (data[4] << 1) | (data[3] & 0x01); ring1 = data[2]; } else { if (features->type == WACOM_21UX2 || features->type == WACOM_22HD) { buttons = (data[8] << 10) | ((data[7] & 0x01) << 9) | (data[6] << 1) | (data[5] & 0x01); if (features->type == WACOM_22HD) { nkeys = 3; keys = data[9] & 0x07; info = !!(data[9] & 0x01); wrench = !!(data[9] & 0x02); } } else { buttons = ((data[6] & 0x10) << 5) | ((data[5] & 0x10) << 4) | ((data[6] & 0x0F) << 4) | (data[5] & 0x0F); } strip1 = ((data[1] & 0x1f) << 8) | data[2]; strip2 = ((data[3] & 0x1f) << 8) | data[4]; } prox = (buttons & ~(~0U << nbuttons)) | (keys & ~(~0U << nkeys)) | (ring1 & 0x80) | (ring2 & 0x80) | strip1 | strip2; wacom_report_numbered_buttons(input, nbuttons, buttons); for (i = 0; i < nkeys; i++) input_report_key(input, KEY_PROG1 + i, keys & (1 << i)); input_report_key(input, KEY_BUTTONCONFIG, wrench); input_report_key(input, KEY_ONSCREEN_KEYBOARD, keyboard); input_report_key(input, KEY_CONTROLPANEL, menu); input_report_key(input, KEY_INFO, info); if (wacom->shared && wacom->shared->touch_input) { input_report_switch(wacom->shared->touch_input, SW_MUTE_DEVICE, !wacom->shared->is_touch_on); input_sync(wacom->shared->touch_input); } input_report_abs(input, ABS_RX, strip1); input_report_abs(input, ABS_RY, strip2); input_report_abs(input, ABS_WHEEL, (ring1 & 0x80) ? (ring1 & 0x7f) : 0); input_report_abs(input, ABS_THROTTLE, (ring2 & 0x80) ? (ring2 & 0x7f) : 0); input_report_key(input, wacom->tool[1], prox ? 1 : 0); input_report_abs(input, ABS_MISC, prox ? PAD_DEVICE_ID : 0); input_event(input, EV_MSC, MSC_SERIAL, 0xffffffff); return 1; } static int wacom_intuos_id_mangle(int tool_id) { return (tool_id & ~0xFFF) << 4 | (tool_id & 0xFFF); } static bool wacom_is_art_pen(int tool_id) { bool is_art_pen = false; switch (tool_id) { case 0x885: /* Intuos3 Marker Pen */ case 0x804: /* Intuos4/5 13HD/24HD Marker Pen */ case 0x10804: /* Intuos4/5 13HD/24HD Art Pen */ case 0x204: /* Art Pen 2 */ is_art_pen = true; break; } return is_art_pen; } static int wacom_intuos_get_tool_type(int tool_id) { switch (tool_id) { case 0x812: /* Inking pen */ case 0x801: /* Intuos3 Inking pen */ case 0x12802: /* Intuos4/5 Inking Pen */ case 0x012: return BTN_TOOL_PENCIL; case 0x832: /* Stroke pen */ case 0x032: return BTN_TOOL_BRUSH; case 0x007: /* Mouse 4D and 2D */ case 0x09c: case 0x094: case 0x017: /* Intuos3 2D Mouse */ case 0x806: /* Intuos4 Mouse */ return BTN_TOOL_MOUSE; case 0x096: /* Lens cursor */ case 0x097: /* Intuos3 Lens cursor */ case 0x006: /* Intuos4 Lens cursor */ return BTN_TOOL_LENS; case 0xd12: case 0x912: case 0x112: case 0x913: /* Intuos3 Airbrush */ case 0x902: /* Intuos4/5 13HD/24HD Airbrush */ case 0x10902: /* Intuos4/5 13HD/24HD Airbrush */ return BTN_TOOL_AIRBRUSH; default: if (tool_id & 0x0008) return BTN_TOOL_RUBBER; return BTN_TOOL_PEN; } } static void wacom_exit_report(struct wacom_wac *wacom) { struct input_dev *input = wacom->pen_input; struct wacom_features *features = &wacom->features; unsigned char *data = wacom->data; int idx = (features->type == INTUOS) ? (data[1] & 0x01) : 0; /* * Reset all states otherwise we lose the initial states * when in-prox next time */ input_report_abs(input, ABS_X, 0); input_report_abs(input, ABS_Y, 0); input_report_abs(input, ABS_DISTANCE, 0); input_report_abs(input, ABS_TILT_X, 0); input_report_abs(input, ABS_TILT_Y, 0); if (wacom->tool[idx] >= BTN_TOOL_MOUSE) { input_report_key(input, BTN_LEFT, 0); input_report_key(input, BTN_MIDDLE, 0); input_report_key(input, BTN_RIGHT, 0); input_report_key(input, BTN_SIDE, 0); input_report_key(input, BTN_EXTRA, 0); input_report_abs(input, ABS_THROTTLE, 0); input_report_abs(input, ABS_RZ, 0); } else { input_report_abs(input, ABS_PRESSURE, 0); input_report_key(input, BTN_STYLUS, 0); input_report_key(input, BTN_STYLUS2, 0); input_report_key(input, BTN_TOUCH, 0); input_report_abs(input, ABS_WHEEL, 0); if (features->type >= INTUOS3S) input_report_abs(input, ABS_Z, 0); } input_report_key(input, wacom->tool[idx], 0); input_report_abs(input, ABS_MISC, 0); /* reset tool id */ input_event(input, EV_MSC, MSC_SERIAL, wacom->serial[idx]); wacom->id[idx] = 0; } static int wacom_intuos_inout(struct wacom_wac *wacom) { struct wacom_features *features = &wacom->features; unsigned char *data = wacom->data; struct input_dev *input = wacom->pen_input; int idx = (features->type == INTUOS) ? (data[1] & 0x01) : 0; if (!(((data[1] & 0xfc) == 0xc0) || /* in prox */ ((data[1] & 0xfe) == 0x20) || /* in range */ ((data[1] & 0xfe) == 0x80))) /* out prox */ return 0; /* Enter report */ if ((data[1] & 0xfc) == 0xc0) { /* serial number of the tool */ wacom->serial[idx] = ((__u64)(data[3] & 0x0f) << 28) + (data[4] << 20) + (data[5] << 12) + (data[6] << 4) + (data[7] >> 4); wacom->id[idx] = (data[2] << 4) | (data[3] >> 4) | ((data[7] & 0x0f) << 16) | ((data[8] & 0xf0) << 8); wacom->tool[idx] = wacom_intuos_get_tool_type(wacom->id[idx]); wacom->shared->stylus_in_proximity = true; return 1; } /* in Range */ if ((data[1] & 0xfe) == 0x20) { if (features->type != INTUOSHT2) wacom->shared->stylus_in_proximity = true; /* in Range while exiting */ if (wacom->reporting_data) { input_report_key(input, BTN_TOUCH, 0); input_report_abs(input, ABS_PRESSURE, 0); input_report_abs(input, ABS_DISTANCE, wacom->features.distance_max); return 2; } return 1; } /* Exit report */ if ((data[1] & 0xfe) == 0x80) { wacom->shared->stylus_in_proximity = false; wacom->reporting_data = false; /* don't report exit if we don't know the ID */ if (!wacom->id[idx]) return 1; wacom_exit_report(wacom); return 2; } return 0; } static inline bool touch_is_muted(struct wacom_wac *wacom_wac) { return wacom_wac->probe_complete && wacom_wac->shared->has_mute_touch_switch && !wacom_wac->shared->is_touch_on; } static inline bool report_touch_events(struct wacom_wac *wacom) { return (touch_arbitration ? !wacom->shared->stylus_in_proximity : 1); } static inline bool delay_pen_events(struct wacom_wac *wacom) { return (wacom->shared->touch_down && touch_arbitration); } static int wacom_intuos_general(struct wacom_wac *wacom) { struct wacom_features *features = &wacom->features; unsigned char *data = wacom->data; struct input_dev *input = wacom->pen_input; int idx = (features->type == INTUOS) ? (data[1] & 0x01) : 0; unsigned char type = (data[1] >> 1) & 0x0F; unsigned int x, y, distance, t; if (data[0] != WACOM_REPORT_PENABLED && data[0] != WACOM_REPORT_CINTIQ && data[0] != WACOM_REPORT_INTUOS_PEN) return 0; if (delay_pen_events(wacom)) return 1; /* don't report events if we don't know the tool ID */ if (!wacom->id[idx]) { /* but reschedule a read of the current tool */ wacom_intuos_schedule_prox_event(wacom); return 1; } /* * don't report events for invalid data */ /* older I4 styli don't work with new Cintiqs */ if ((!((wacom->id[idx] >> 16) & 0x01) && (features->type == WACOM_21UX2)) || /* Only large Intuos support Lense Cursor */ (wacom->tool[idx] == BTN_TOOL_LENS && (features->type == INTUOS3 || features->type == INTUOS3S || features->type == INTUOS4 || features->type == INTUOS4S || features->type == INTUOS5 || features->type == INTUOS5S || features->type == INTUOSPM || features->type == INTUOSPS)) || /* Cintiq doesn't send data when RDY bit isn't set */ (features->type == CINTIQ && !(data[1] & 0x40))) return 1; x = (be16_to_cpup((__be16 *)&data[2]) << 1) | ((data[9] >> 1) & 1); y = (be16_to_cpup((__be16 *)&data[4]) << 1) | (data[9] & 1); distance = data[9] >> 2; if (features->type < INTUOS3S) { x >>= 1; y >>= 1; distance >>= 1; } if (features->type == INTUOSHT2) distance = features->distance_max - distance; input_report_abs(input, ABS_X, x); input_report_abs(input, ABS_Y, y); input_report_abs(input, ABS_DISTANCE, distance); switch (type) { case 0x00: case 0x01: case 0x02: case 0x03: /* general pen packet */ t = (data[6] << 3) | ((data[7] & 0xC0) >> 5) | (data[1] & 1); if (features->pressure_max < 2047) t >>= 1; input_report_abs(input, ABS_PRESSURE, t); if (features->type != INTUOSHT2) { input_report_abs(input, ABS_TILT_X, (((data[7] << 1) & 0x7e) | (data[8] >> 7)) - 64); input_report_abs(input, ABS_TILT_Y, (data[8] & 0x7f) - 64); } input_report_key(input, BTN_STYLUS, data[1] & 2); input_report_key(input, BTN_STYLUS2, data[1] & 4); input_report_key(input, BTN_TOUCH, t > 10); break; case 0x0a: /* airbrush second packet */ input_report_abs(input, ABS_WHEEL, (data[6] << 2) | ((data[7] >> 6) & 3)); input_report_abs(input, ABS_TILT_X, (((data[7] << 1) & 0x7e) | (data[8] >> 7)) - 64); input_report_abs(input, ABS_TILT_Y, (data[8] & 0x7f) - 64); break; case 0x05: /* Rotation packet */ if (features->type >= INTUOS3S) { /* I3 marker pen rotation */ t = (data[6] << 3) | ((data[7] >> 5) & 7); t = (data[7] & 0x20) ? ((t > 900) ? ((t-1) / 2 - 1350) : ((t-1) / 2 + 450)) : (450 - t / 2) ; input_report_abs(input, ABS_Z, t); } else { /* 4D mouse 2nd packet */ t = (data[6] << 3) | ((data[7] >> 5) & 7); input_report_abs(input, ABS_RZ, (data[7] & 0x20) ? ((t - 1) / 2) : -t / 2); } break; case 0x04: /* 4D mouse 1st packet */ input_report_key(input, BTN_LEFT, data[8] & 0x01); input_report_key(input, BTN_MIDDLE, data[8] & 0x02); input_report_key(input, BTN_RIGHT, data[8] & 0x04); input_report_key(input, BTN_SIDE, data[8] & 0x20); input_report_key(input, BTN_EXTRA, data[8] & 0x10); t = (data[6] << 2) | ((data[7] >> 6) & 3); input_report_abs(input, ABS_THROTTLE, (data[8] & 0x08) ? -t : t); break; case 0x06: /* I4 mouse */ input_report_key(input, BTN_LEFT, data[6] & 0x01); input_report_key(input, BTN_MIDDLE, data[6] & 0x02); input_report_key(input, BTN_RIGHT, data[6] & 0x04); input_report_rel(input, REL_WHEEL, ((data[7] & 0x80) >> 7) - ((data[7] & 0x40) >> 6)); input_report_key(input, BTN_SIDE, data[6] & 0x08); input_report_key(input, BTN_EXTRA, data[6] & 0x10); input_report_abs(input, ABS_TILT_X, (((data[7] << 1) & 0x7e) | (data[8] >> 7)) - 64); input_report_abs(input, ABS_TILT_Y, (data[8] & 0x7f) - 64); break; case 0x08: if (wacom->tool[idx] == BTN_TOOL_MOUSE) { /* 2D mouse packet */ input_report_key(input, BTN_LEFT, data[8] & 0x04); input_report_key(input, BTN_MIDDLE, data[8] & 0x08); input_report_key(input, BTN_RIGHT, data[8] & 0x10); input_report_rel(input, REL_WHEEL, (data[8] & 0x01) - ((data[8] & 0x02) >> 1)); /* I3 2D mouse side buttons */ if (features->type >= INTUOS3S && features->type <= INTUOS3L) { input_report_key(input, BTN_SIDE, data[8] & 0x40); input_report_key(input, BTN_EXTRA, data[8] & 0x20); } } else if (wacom->tool[idx] == BTN_TOOL_LENS) { /* Lens cursor packets */ input_report_key(input, BTN_LEFT, data[8] & 0x01); input_report_key(input, BTN_MIDDLE, data[8] & 0x02); input_report_key(input, BTN_RIGHT, data[8] & 0x04); input_report_key(input, BTN_SIDE, data[8] & 0x10); input_report_key(input, BTN_EXTRA, data[8] & 0x08); } break; case 0x07: case 0x09: case 0x0b: case 0x0c: case 0x0d: case 0x0e: case 0x0f: /* unhandled */ break; } input_report_abs(input, ABS_MISC, wacom_intuos_id_mangle(wacom->id[idx])); /* report tool id */ input_report_key(input, wacom->tool[idx], 1); input_event(input, EV_MSC, MSC_SERIAL, wacom->serial[idx]); wacom->reporting_data = true; return 2; } static int wacom_intuos_irq(struct wacom_wac *wacom) { unsigned char *data = wacom->data; struct input_dev *input = wacom->pen_input; int result; if (data[0] != WACOM_REPORT_PENABLED && data[0] != WACOM_REPORT_INTUOS_ID1 && data[0] != WACOM_REPORT_INTUOS_ID2 && data[0] != WACOM_REPORT_INTUOSPAD && data[0] != WACOM_REPORT_INTUOS_PEN && data[0] != WACOM_REPORT_CINTIQ && data[0] != WACOM_REPORT_CINTIQPAD && data[0] != WACOM_REPORT_INTUOS5PAD) { dev_dbg(input->dev.parent, "%s: received unknown report #%d\n", __func__, data[0]); return 0; } /* process pad events */ result = wacom_intuos_pad(wacom); if (result) return result; /* process in/out prox events */ result = wacom_intuos_inout(wacom); if (result) return result - 1; /* process general packets */ result = wacom_intuos_general(wacom); if (result) return result - 1; return 0; } static int wacom_remote_irq(struct wacom_wac *wacom_wac, size_t len) { unsigned char *data = wacom_wac->data; struct input_dev *input; struct wacom *wacom = container_of(wacom_wac, struct wacom, wacom_wac); struct wacom_remote *remote = wacom->remote; int bat_charging, bat_percent, touch_ring_mode; __u32 serial; int i, index = -1; unsigned long flags; if (data[0] != WACOM_REPORT_REMOTE) { hid_dbg(wacom->hdev, "%s: received unknown report #%d", __func__, data[0]); return 0; } serial = data[3] + (data[4] << 8) + (data[5] << 16); wacom_wac->id[0] = PAD_DEVICE_ID; spin_lock_irqsave(&remote->remote_lock, flags); for (i = 0; i < WACOM_MAX_REMOTES; i++) { if (remote->remotes[i].serial == serial) { index = i; break; } } if (index < 0 || !remote->remotes[index].registered) goto out; remote->remotes[i].active_time = ktime_get(); input = remote->remotes[index].input; input_report_key(input, BTN_0, (data[9] & 0x01)); input_report_key(input, BTN_1, (data[9] & 0x02)); input_report_key(input, BTN_2, (data[9] & 0x04)); input_report_key(input, BTN_3, (data[9] & 0x08)); input_report_key(input, BTN_4, (data[9] & 0x10)); input_report_key(input, BTN_5, (data[9] & 0x20)); input_report_key(input, BTN_6, (data[9] & 0x40)); input_report_key(input, BTN_7, (data[9] & 0x80)); input_report_key(input, BTN_8, (data[10] & 0x01)); input_report_key(input, BTN_9, (data[10] & 0x02)); input_report_key(input, BTN_A, (data[10] & 0x04)); input_report_key(input, BTN_B, (data[10] & 0x08)); input_report_key(input, BTN_C, (data[10] & 0x10)); input_report_key(input, BTN_X, (data[10] & 0x20)); input_report_key(input, BTN_Y, (data[10] & 0x40)); input_report_key(input, BTN_Z, (data[10] & 0x80)); input_report_key(input, BTN_BASE, (data[11] & 0x01)); input_report_key(input, BTN_BASE2, (data[11] & 0x02)); if (data[12] & 0x80) input_report_abs(input, ABS_WHEEL, (data[12] & 0x7f) - 1); else input_report_abs(input, ABS_WHEEL, 0); bat_percent = data[7] & 0x7f; bat_charging = !!(data[7] & 0x80); if (data[9] | data[10] | (data[11] & 0x03) | data[12]) input_report_abs(input, ABS_MISC, PAD_DEVICE_ID); else input_report_abs(input, ABS_MISC, 0); input_event(input, EV_MSC, MSC_SERIAL, serial); input_sync(input); /*Which mode select (LED light) is currently on?*/ touch_ring_mode = (data[11] & 0xC0) >> 6; for (i = 0; i < WACOM_MAX_REMOTES; i++) { if (remote->remotes[i].serial == serial) wacom->led.groups[i].select = touch_ring_mode; } __wacom_notify_battery(&remote->remotes[index].battery, WACOM_POWER_SUPPLY_STATUS_AUTO, bat_percent, bat_charging, 1, bat_charging); out: spin_unlock_irqrestore(&remote->remote_lock, flags); return 0; } static void wacom_remote_status_irq(struct wacom_wac *wacom_wac, size_t len) { struct wacom *wacom = container_of(wacom_wac, struct wacom, wacom_wac); unsigned char *data = wacom_wac->data; struct wacom_remote *remote = wacom->remote; struct wacom_remote_work_data remote_data; unsigned long flags; int i, ret; if (data[0] != WACOM_REPORT_DEVICE_LIST) return; memset(&remote_data, 0, sizeof(struct wacom_remote_work_data)); for (i = 0; i < WACOM_MAX_REMOTES; i++) { int j = i * 6; int serial = (data[j+6] << 16) + (data[j+5] << 8) + data[j+4]; remote_data.remote[i].serial = serial; } spin_lock_irqsave(&remote->remote_lock, flags); ret = kfifo_in(&remote->remote_fifo, &remote_data, sizeof(remote_data)); if (ret != sizeof(remote_data)) { spin_unlock_irqrestore(&remote->remote_lock, flags); hid_err(wacom->hdev, "Can't queue Remote status event.\n"); return; } spin_unlock_irqrestore(&remote->remote_lock, flags); wacom_schedule_work(wacom_wac, WACOM_WORKER_REMOTE); } static int int_dist(int x1, int y1, int x2, int y2) { int x = x2 - x1; int y = y2 - y1; return int_sqrt(x*x + y*y); } static void wacom_intuos_bt_process_data(struct wacom_wac *wacom, unsigned char *data) { memcpy(wacom->data, data, 10); wacom_intuos_irq(wacom); input_sync(wacom->pen_input); if (wacom->pad_input) input_sync(wacom->pad_input); } static int wacom_intuos_bt_irq(struct wacom_wac *wacom, size_t len) { u8 *data = kmemdup(wacom->data, len, GFP_KERNEL); int i = 1; unsigned power_raw, battery_capacity, bat_charging, ps_connected; switch (data[0]) { case 0x04: wacom_intuos_bt_process_data(wacom, data + i); i += 10; fallthrough; case 0x03: wacom_intuos_bt_process_data(wacom, data + i); i += 10; wacom_intuos_bt_process_data(wacom, data + i); i += 10; power_raw = data[i]; bat_charging = (power_raw & 0x08) ? 1 : 0; ps_connected = (power_raw & 0x10) ? 1 : 0; battery_capacity = batcap_i4[power_raw & 0x07]; wacom_notify_battery(wacom, WACOM_POWER_SUPPLY_STATUS_AUTO, battery_capacity, bat_charging, battery_capacity || bat_charging, ps_connected); break; default: dev_dbg(wacom->pen_input->dev.parent, "Unknown report: %d,%d size:%zu\n", data[0], data[1], len); break; } kfree(data); return 0; } static int wacom_wac_finger_count_touches(struct wacom_wac *wacom) { struct input_dev *input = wacom->touch_input; unsigned touch_max = wacom->features.touch_max; int count = 0; int i; if (!touch_max) return 0; if (touch_max == 1) return test_bit(BTN_TOUCH, input->key) && report_touch_events(wacom); for (i = 0; i < input->mt->num_slots; i++) { struct input_mt_slot *ps = &input->mt->slots[i]; int id = input_mt_get_value(ps, ABS_MT_TRACKING_ID); if (id >= 0) count++; } return count; } static void wacom_intuos_pro2_bt_pen(struct wacom_wac *wacom) { int pen_frame_len, pen_frames; struct input_dev *pen_input = wacom->pen_input; unsigned char *data = wacom->data; int number_of_valid_frames = 0; ktime_t time_interval = 15000000; ktime_t time_packet_received = ktime_get(); int i; if (wacom->features.type == INTUOSP2_BT || wacom->features.type == INTUOSP2S_BT) { wacom->serial[0] = get_unaligned_le64(&data[99]); wacom->id[0] = get_unaligned_le16(&data[107]); pen_frame_len = 14; pen_frames = 7; } else { wacom->serial[0] = get_unaligned_le64(&data[33]); wacom->id[0] = get_unaligned_le16(&data[41]); pen_frame_len = 8; pen_frames = 4; } if (wacom->serial[0] >> 52 == 1) { /* Add back in missing bits of ID for non-USI pens */ wacom->id[0] |= (wacom->serial[0] >> 32) & 0xFFFFF; } /* number of valid frames */ for (i = 0; i < pen_frames; i++) { unsigned char *frame = &data[i*pen_frame_len + 1]; bool valid = frame[0] & 0x80; if (valid) number_of_valid_frames++; } if (number_of_valid_frames) { if (wacom->hid_data.time_delayed) time_interval = ktime_get() - wacom->hid_data.time_delayed; time_interval = div_u64(time_interval, number_of_valid_frames); wacom->hid_data.time_delayed = time_packet_received; } for (i = 0; i < number_of_valid_frames; i++) { unsigned char *frame = &data[i*pen_frame_len + 1]; bool valid = frame[0] & 0x80; bool prox = frame[0] & 0x40; bool range = frame[0] & 0x20; bool invert = frame[0] & 0x10; int frames_number_reversed = number_of_valid_frames - i - 1; ktime_t event_timestamp = time_packet_received - frames_number_reversed * time_interval; if (!valid) continue; if (!prox) { wacom->shared->stylus_in_proximity = false; wacom_exit_report(wacom); input_sync(pen_input); wacom->tool[0] = 0; wacom->id[0] = 0; wacom->serial[0] = 0; wacom->hid_data.time_delayed = 0; return; } if (range) { if (!wacom->tool[0]) { /* first in range */ /* Going into range select tool */ if (invert) wacom->tool[0] = BTN_TOOL_RUBBER; else if (wacom->id[0]) wacom->tool[0] = wacom_intuos_get_tool_type(wacom->id[0]); else wacom->tool[0] = BTN_TOOL_PEN; } input_report_abs(pen_input, ABS_X, get_unaligned_le16(&frame[1])); input_report_abs(pen_input, ABS_Y, get_unaligned_le16(&frame[3])); if (wacom->features.type == INTUOSP2_BT || wacom->features.type == INTUOSP2S_BT) { /* Fix rotation alignment: userspace expects zero at left */ int16_t rotation = (int16_t)get_unaligned_le16(&frame[9]); rotation += 1800/4; if (rotation > 899) rotation -= 1800; input_report_abs(pen_input, ABS_TILT_X, (signed char)frame[7]); input_report_abs(pen_input, ABS_TILT_Y, (signed char)frame[8]); input_report_abs(pen_input, ABS_Z, rotation); input_report_abs(pen_input, ABS_WHEEL, get_unaligned_le16(&frame[11])); } } if (wacom->tool[0]) { input_report_abs(pen_input, ABS_PRESSURE, get_unaligned_le16(&frame[5])); if (wacom->features.type == INTUOSP2_BT || wacom->features.type == INTUOSP2S_BT) { input_report_abs(pen_input, ABS_DISTANCE, range ? frame[13] : wacom->features.distance_max); } else { input_report_abs(pen_input, ABS_DISTANCE, range ? frame[7] : wacom->features.distance_max); } input_report_key(pen_input, BTN_TOUCH, frame[0] & 0x09); input_report_key(pen_input, BTN_STYLUS, frame[0] & 0x02); input_report_key(pen_input, BTN_STYLUS2, frame[0] & 0x04); input_report_key(pen_input, wacom->tool[0], prox); input_event(pen_input, EV_MSC, MSC_SERIAL, wacom->serial[0]); input_report_abs(pen_input, ABS_MISC, wacom_intuos_id_mangle(wacom->id[0])); /* report tool id */ } wacom->shared->stylus_in_proximity = prox; /* add timestamp to unpack the frames */ input_set_timestamp(pen_input, event_timestamp); input_sync(pen_input); } } static void wacom_intuos_pro2_bt_touch(struct wacom_wac *wacom) { const int finger_touch_len = 8; const int finger_frames = 4; const int finger_frame_len = 43; struct input_dev *touch_input = wacom->touch_input; unsigned char *data = wacom->data; int num_contacts_left = 5; int i, j; for (i = 0; i < finger_frames; i++) { unsigned char *frame = &data[i*finger_frame_len + 109]; int current_num_contacts = frame[0] & 0x7F; int contacts_to_send; if (!(frame[0] & 0x80)) continue; /* * First packet resets the counter since only the first * packet in series will have non-zero current_num_contacts. */ if (current_num_contacts) wacom->num_contacts_left = current_num_contacts; contacts_to_send = min(num_contacts_left, wacom->num_contacts_left); for (j = 0; j < contacts_to_send; j++) { unsigned char *touch = &frame[j*finger_touch_len + 1]; int slot = input_mt_get_slot_by_key(touch_input, touch[0]); int x = get_unaligned_le16(&touch[2]); int y = get_unaligned_le16(&touch[4]); int w = touch[6] * input_abs_get_res(touch_input, ABS_MT_POSITION_X); int h = touch[7] * input_abs_get_res(touch_input, ABS_MT_POSITION_Y); if (slot < 0) continue; input_mt_slot(touch_input, slot); input_mt_report_slot_state(touch_input, MT_TOOL_FINGER, touch[1] & 0x01); input_report_abs(touch_input, ABS_MT_POSITION_X, x); input_report_abs(touch_input, ABS_MT_POSITION_Y, y); input_report_abs(touch_input, ABS_MT_TOUCH_MAJOR, max(w, h)); input_report_abs(touch_input, ABS_MT_TOUCH_MINOR, min(w, h)); input_report_abs(touch_input, ABS_MT_ORIENTATION, w > h); } input_mt_sync_frame(touch_input); wacom->num_contacts_left -= contacts_to_send; if (wacom->num_contacts_left <= 0) { wacom->num_contacts_left = 0; wacom->shared->touch_down = wacom_wac_finger_count_touches(wacom); input_sync(touch_input); } } if (wacom->num_contacts_left == 0) { // Be careful that we don't accidentally call input_sync with // only a partial set of fingers of processed input_report_switch(touch_input, SW_MUTE_DEVICE, !(data[281] >> 7)); input_sync(touch_input); } } static void wacom_intuos_pro2_bt_pad(struct wacom_wac *wacom) { struct input_dev *pad_input = wacom->pad_input; unsigned char *data = wacom->data; int nbuttons = wacom->features.numbered_buttons; int expresskeys = data[282]; int center = (data[281] & 0x40) >> 6; int ring = data[285] & 0x7F; bool ringstatus = data[285] & 0x80; bool prox = expresskeys || center || ringstatus; /* Fix touchring data: userspace expects 0 at left and increasing clockwise */ ring = 71 - ring; ring += 3*72/16; if (ring > 71) ring -= 72; wacom_report_numbered_buttons(pad_input, nbuttons, expresskeys | (center << (nbuttons - 1))); input_report_abs(pad_input, ABS_WHEEL, ringstatus ? ring : 0); input_report_key(pad_input, wacom->tool[1], prox ? 1 : 0); input_report_abs(pad_input, ABS_MISC, prox ? PAD_DEVICE_ID : 0); input_event(pad_input, EV_MSC, MSC_SERIAL, 0xffffffff); input_sync(pad_input); } static void wacom_intuos_pro2_bt_battery(struct wacom_wac *wacom) { unsigned char *data = wacom->data; bool chg = data[284] & 0x80; int battery_status = data[284] & 0x7F; wacom_notify_battery(wacom, WACOM_POWER_SUPPLY_STATUS_AUTO, battery_status, chg, 1, chg); } static void wacom_intuos_gen3_bt_pad(struct wacom_wac *wacom) { struct input_dev *pad_input = wacom->pad_input; unsigned char *data = wacom->data; int buttons = data[44]; wacom_report_numbered_buttons(pad_input, 4, buttons); input_report_key(pad_input, wacom->tool[1], buttons ? 1 : 0); input_report_abs(pad_input, ABS_MISC, buttons ? PAD_DEVICE_ID : 0); input_event(pad_input, EV_MSC, MSC_SERIAL, 0xffffffff); input_sync(pad_input); } static void wacom_intuos_gen3_bt_battery(struct wacom_wac *wacom) { unsigned char *data = wacom->data; bool chg = data[45] & 0x80; int battery_status = data[45] & 0x7F; wacom_notify_battery(wacom, WACOM_POWER_SUPPLY_STATUS_AUTO, battery_status, chg, 1, chg); } static int wacom_intuos_pro2_bt_irq(struct wacom_wac *wacom, size_t len) { unsigned char *data = wacom->data; if (data[0] != 0x80 && data[0] != 0x81) { dev_dbg(wacom->pen_input->dev.parent, "%s: received unknown report #%d\n", __func__, data[0]); return 0; } wacom_intuos_pro2_bt_pen(wacom); if (wacom->features.type == INTUOSP2_BT || wacom->features.type == INTUOSP2S_BT) { wacom_intuos_pro2_bt_touch(wacom); wacom_intuos_pro2_bt_pad(wacom); wacom_intuos_pro2_bt_battery(wacom); } else { wacom_intuos_gen3_bt_pad(wacom); wacom_intuos_gen3_bt_battery(wacom); } return 0; } static int wacom_24hdt_irq(struct wacom_wac *wacom) { struct input_dev *input = wacom->touch_input; unsigned char *data = wacom->data; int i; int current_num_contacts = data[61]; int contacts_to_send = 0; int num_contacts_left = 4; /* maximum contacts per packet */ int byte_per_packet = WACOM_BYTES_PER_24HDT_PACKET; int y_offset = 2; if (touch_is_muted(wacom) && !wacom->shared->touch_down) return 0; if (wacom->features.type == WACOM_27QHDT) { current_num_contacts = data[63]; num_contacts_left = 10; byte_per_packet = WACOM_BYTES_PER_QHDTHID_PACKET; y_offset = 0; } /* * First packet resets the counter since only the first * packet in series will have non-zero current_num_contacts. */ if (current_num_contacts) wacom->num_contacts_left = current_num_contacts; contacts_to_send = min(num_contacts_left, wacom->num_contacts_left); for (i = 0; i < contacts_to_send; i++) { int offset = (byte_per_packet * i) + 1; bool touch = (data[offset] & 0x1) && report_touch_events(wacom); int slot = input_mt_get_slot_by_key(input, data[offset + 1]); if (slot < 0) continue; input_mt_slot(input, slot); input_mt_report_slot_state(input, MT_TOOL_FINGER, touch); if (touch) { int t_x = get_unaligned_le16(&data[offset + 2]); int t_y = get_unaligned_le16(&data[offset + 4 + y_offset]); input_report_abs(input, ABS_MT_POSITION_X, t_x); input_report_abs(input, ABS_MT_POSITION_Y, t_y); if (wacom->features.type != WACOM_27QHDT) { int c_x = get_unaligned_le16(&data[offset + 4]); int c_y = get_unaligned_le16(&data[offset + 8]); int w = get_unaligned_le16(&data[offset + 10]); int h = get_unaligned_le16(&data[offset + 12]); input_report_abs(input, ABS_MT_TOUCH_MAJOR, min(w,h)); input_report_abs(input, ABS_MT_WIDTH_MAJOR, min(w, h) + int_dist(t_x, t_y, c_x, c_y)); input_report_abs(input, ABS_MT_WIDTH_MINOR, min(w, h)); input_report_abs(input, ABS_MT_ORIENTATION, w > h); } } } input_mt_sync_frame(input); wacom->num_contacts_left -= contacts_to_send; if (wacom->num_contacts_left <= 0) { wacom->num_contacts_left = 0; wacom->shared->touch_down = wacom_wac_finger_count_touches(wacom); } return 1; } static int wacom_mt_touch(struct wacom_wac *wacom) { struct input_dev *input = wacom->touch_input; unsigned char *data = wacom->data; int i; int current_num_contacts = data[2]; int contacts_to_send = 0; int x_offset = 0; /* MTTPC does not support Height and Width */ if (wacom->features.type == MTTPC || wacom->features.type == MTTPC_B) x_offset = -4; /* * First packet resets the counter since only the first * packet in series will have non-zero current_num_contacts. */ if (current_num_contacts) wacom->num_contacts_left = current_num_contacts; /* There are at most 5 contacts per packet */ contacts_to_send = min(5, wacom->num_contacts_left); for (i = 0; i < contacts_to_send; i++) { int offset = (WACOM_BYTES_PER_MT_PACKET + x_offset) * i + 3; bool touch = (data[offset] & 0x1) && report_touch_events(wacom); int id = get_unaligned_le16(&data[offset + 1]); int slot = input_mt_get_slot_by_key(input, id); if (slot < 0) continue; input_mt_slot(input, slot); input_mt_report_slot_state(input, MT_TOOL_FINGER, touch); if (touch) { int x = get_unaligned_le16(&data[offset + x_offset + 7]); int y = get_unaligned_le16(&data[offset + x_offset + 9]); input_report_abs(input, ABS_MT_POSITION_X, x); input_report_abs(input, ABS_MT_POSITION_Y, y); } } input_mt_sync_frame(input); wacom->num_contacts_left -= contacts_to_send; if (wacom->num_contacts_left <= 0) { wacom->num_contacts_left = 0; wacom->shared->touch_down = wacom_wac_finger_count_touches(wacom); } return 1; } static int wacom_tpc_mt_touch(struct wacom_wac *wacom) { struct input_dev *input = wacom->touch_input; unsigned char *data = wacom->data; int i; for (i = 0; i < 2; i++) { int p = data[1] & (1 << i); bool touch = p && report_touch_events(wacom); input_mt_slot(input, i); input_mt_report_slot_state(input, MT_TOOL_FINGER, touch); if (touch) { int x = le16_to_cpup((__le16 *)&data[i * 2 + 2]) & 0x7fff; int y = le16_to_cpup((__le16 *)&data[i * 2 + 6]) & 0x7fff; input_report_abs(input, ABS_MT_POSITION_X, x); input_report_abs(input, ABS_MT_POSITION_Y, y); } } input_mt_sync_frame(input); /* keep touch state for pen event */ wacom->shared->touch_down = wacom_wac_finger_count_touches(wacom); return 1; } static int wacom_tpc_single_touch(struct wacom_wac *wacom, size_t len) { unsigned char *data = wacom->data; struct input_dev *input = wacom->touch_input; bool prox = report_touch_events(wacom); int x = 0, y = 0; if (wacom->features.touch_max > 1 || len > WACOM_PKGLEN_TPC2FG) return 0; if (len == WACOM_PKGLEN_TPC1FG) { prox = prox && (data[0] & 0x01); x = get_unaligned_le16(&data[1]); y = get_unaligned_le16(&data[3]); } else if (len == WACOM_PKGLEN_TPC1FG_B) { prox = prox && (data[2] & 0x01); x = get_unaligned_le16(&data[3]); y = get_unaligned_le16(&data[5]); } else { prox = prox && (data[1] & 0x01); x = le16_to_cpup((__le16 *)&data[2]); y = le16_to_cpup((__le16 *)&data[4]); } if (prox) { input_report_abs(input, ABS_X, x); input_report_abs(input, ABS_Y, y); } input_report_key(input, BTN_TOUCH, prox); /* keep touch state for pen events */ wacom->shared->touch_down = prox; return 1; } static int wacom_tpc_pen(struct wacom_wac *wacom) { unsigned char *data = wacom->data; struct input_dev *input = wacom->pen_input; bool prox = data[1] & 0x20; if (!wacom->shared->stylus_in_proximity) /* first in prox */ /* Going into proximity select tool */ wacom->tool[0] = (data[1] & 0x0c) ? BTN_TOOL_RUBBER : BTN_TOOL_PEN; /* keep pen state for touch events */ wacom->shared->stylus_in_proximity = prox; /* send pen events only when touch is up or forced out * or touch arbitration is off */ if (!delay_pen_events(wacom)) { input_report_key(input, BTN_STYLUS, data[1] & 0x02); input_report_key(input, BTN_STYLUS2, data[1] & 0x10); input_report_abs(input, ABS_X, le16_to_cpup((__le16 *)&data[2])); input_report_abs(input, ABS_Y, le16_to_cpup((__le16 *)&data[4])); input_report_abs(input, ABS_PRESSURE, ((data[7] & 0x07) << 8) | data[6]); input_report_key(input, BTN_TOUCH, data[1] & 0x05); input_report_key(input, wacom->tool[0], prox); return 1; } return 0; } static int wacom_tpc_irq(struct wacom_wac *wacom, size_t len) { unsigned char *data = wacom->data; if (wacom->pen_input) { dev_dbg(wacom->pen_input->dev.parent, "%s: received report #%d\n", __func__, data[0]); if (len == WACOM_PKGLEN_PENABLED || data[0] == WACOM_REPORT_PENABLED) return wacom_tpc_pen(wacom); } else if (wacom->touch_input) { dev_dbg(wacom->touch_input->dev.parent, "%s: received report #%d\n", __func__, data[0]); switch (len) { case WACOM_PKGLEN_TPC1FG: return wacom_tpc_single_touch(wacom, len); case WACOM_PKGLEN_TPC2FG: return wacom_tpc_mt_touch(wacom); default: switch (data[0]) { case WACOM_REPORT_TPC1FG: case WACOM_REPORT_TPCHID: case WACOM_REPORT_TPCST: case WACOM_REPORT_TPC1FGE: return wacom_tpc_single_touch(wacom, len); case WACOM_REPORT_TPCMT: case WACOM_REPORT_TPCMT2: return wacom_mt_touch(wacom); } } } return 0; } static int wacom_offset_rotation(struct input_dev *input, struct hid_usage *usage, int value, int num, int denom) { struct input_absinfo *abs = &input->absinfo[usage->code]; int range = (abs->maximum - abs->minimum + 1); value += num*range/denom; if (value > abs->maximum) value -= range; else if (value < abs->minimum) value += range; return value; } int wacom_equivalent_usage(int usage) { if ((usage & HID_USAGE_PAGE) == WACOM_HID_UP_WACOMDIGITIZER) { int subpage = (usage & 0xFF00) << 8; int subusage = (usage & 0xFF); if (subpage == WACOM_HID_SP_PAD || subpage == WACOM_HID_SP_BUTTON || subpage == WACOM_HID_SP_DIGITIZER || subpage == WACOM_HID_SP_DIGITIZERINFO || usage == WACOM_HID_WD_SENSE || usage == WACOM_HID_WD_SERIALHI || usage == WACOM_HID_WD_TOOLTYPE || usage == WACOM_HID_WD_DISTANCE || usage == WACOM_HID_WD_TOUCHSTRIP || usage == WACOM_HID_WD_TOUCHSTRIP2 || usage == WACOM_HID_WD_TOUCHRING || usage == WACOM_HID_WD_TOUCHRINGSTATUS || usage == WACOM_HID_WD_REPORT_VALID || usage == WACOM_HID_WD_BARRELSWITCH3 || usage == WACOM_HID_WD_SEQUENCENUMBER) { return usage; } if (subpage == HID_UP_UNDEFINED) subpage = HID_UP_DIGITIZER; return subpage | subusage; } if ((usage & HID_USAGE_PAGE) == WACOM_HID_UP_WACOMTOUCH) { int subpage = (usage & 0xFF00) << 8; int subusage = (usage & 0xFF); if (usage == WACOM_HID_WT_REPORT_VALID) return usage; if (subpage == HID_UP_UNDEFINED) subpage = WACOM_HID_SP_DIGITIZER; return subpage | subusage; } return usage; } static void wacom_map_usage(struct input_dev *input, struct hid_usage *usage, struct hid_field *field, __u8 type, __u16 code, int fuzz) { struct wacom *wacom = input_get_drvdata(input); struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct wacom_features *features = &wacom_wac->features; int fmin = field->logical_minimum; int fmax = field->logical_maximum; unsigned int equivalent_usage = wacom_equivalent_usage(usage->hid); int resolution_code = code; int resolution; if (equivalent_usage == HID_DG_TWIST) { resolution_code = ABS_RZ; } resolution = hidinput_calc_abs_res(field, resolution_code); if (equivalent_usage == HID_GD_X) { fmin += features->offset_left; fmax -= features->offset_right; } if (equivalent_usage == HID_GD_Y) { fmin += features->offset_top; fmax -= features->offset_bottom; } usage->type = type; usage->code = code; switch (type) { case EV_ABS: input_set_abs_params(input, code, fmin, fmax, fuzz, 0); /* older tablet may miss physical usage */ if ((code == ABS_X || code == ABS_Y) && !resolution) { resolution = WACOM_INTUOS_RES; hid_warn(input, "Using default resolution for axis type 0x%x code 0x%x\n", type, code); } input_abs_set_res(input, code, resolution); break; case EV_REL: case EV_KEY: case EV_MSC: case EV_SW: input_set_capability(input, type, code); break; } } static void wacom_wac_battery_usage_mapping(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage) { return; } static void wacom_wac_battery_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; unsigned equivalent_usage = wacom_equivalent_usage(usage->hid); switch (equivalent_usage) { case HID_DG_BATTERYSTRENGTH: if (value == 0) { wacom_wac->hid_data.bat_status = POWER_SUPPLY_STATUS_UNKNOWN; } else { value = value * 100 / (field->logical_maximum - field->logical_minimum); wacom_wac->hid_data.battery_capacity = value; wacom_wac->hid_data.bat_connected = 1; wacom_wac->hid_data.bat_status = WACOM_POWER_SUPPLY_STATUS_AUTO; } wacom_wac->features.quirks |= WACOM_QUIRK_BATTERY; break; case WACOM_HID_WD_BATTERY_LEVEL: value = value * 100 / (field->logical_maximum - field->logical_minimum); wacom_wac->hid_data.battery_capacity = value; wacom_wac->hid_data.bat_connected = 1; wacom_wac->hid_data.bat_status = WACOM_POWER_SUPPLY_STATUS_AUTO; wacom_wac->features.quirks |= WACOM_QUIRK_BATTERY; break; case WACOM_HID_WD_BATTERY_CHARGING: wacom_wac->hid_data.bat_charging = value; wacom_wac->hid_data.ps_connected = value; wacom_wac->hid_data.bat_connected = 1; wacom_wac->hid_data.bat_status = WACOM_POWER_SUPPLY_STATUS_AUTO; wacom_wac->features.quirks |= WACOM_QUIRK_BATTERY; break; } } static void wacom_wac_battery_pre_report(struct hid_device *hdev, struct hid_report *report) { return; } static void wacom_wac_battery_report(struct hid_device *hdev, struct hid_report *report) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; int status = wacom_wac->hid_data.bat_status; int capacity = wacom_wac->hid_data.battery_capacity; bool charging = wacom_wac->hid_data.bat_charging; bool connected = wacom_wac->hid_data.bat_connected; bool powered = wacom_wac->hid_data.ps_connected; wacom_notify_battery(wacom_wac, status, capacity, charging, connected, powered); } static void wacom_wac_pad_usage_mapping(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct wacom_features *features = &wacom_wac->features; struct input_dev *input = wacom_wac->pad_input; unsigned equivalent_usage = wacom_equivalent_usage(usage->hid); switch (equivalent_usage) { case WACOM_HID_WD_ACCELEROMETER_X: __set_bit(INPUT_PROP_ACCELEROMETER, input->propbit); wacom_map_usage(input, usage, field, EV_ABS, ABS_X, 0); features->device_type |= WACOM_DEVICETYPE_PAD; break; case WACOM_HID_WD_ACCELEROMETER_Y: __set_bit(INPUT_PROP_ACCELEROMETER, input->propbit); wacom_map_usage(input, usage, field, EV_ABS, ABS_Y, 0); features->device_type |= WACOM_DEVICETYPE_PAD; break; case WACOM_HID_WD_ACCELEROMETER_Z: __set_bit(INPUT_PROP_ACCELEROMETER, input->propbit); wacom_map_usage(input, usage, field, EV_ABS, ABS_Z, 0); features->device_type |= WACOM_DEVICETYPE_PAD; break; case WACOM_HID_WD_BUTTONCENTER: case WACOM_HID_WD_BUTTONHOME: case WACOM_HID_WD_BUTTONUP: case WACOM_HID_WD_BUTTONDOWN: case WACOM_HID_WD_BUTTONLEFT: case WACOM_HID_WD_BUTTONRIGHT: wacom_map_usage(input, usage, field, EV_KEY, wacom_numbered_button_to_key(features->numbered_buttons), 0); features->numbered_buttons++; features->device_type |= WACOM_DEVICETYPE_PAD; break; case WACOM_HID_WD_MUTE_DEVICE: /* softkey touch switch */ wacom_wac->is_soft_touch_switch = true; fallthrough; case WACOM_HID_WD_TOUCHONOFF: /* * These two usages, which are used to mute touch events, come * from the pad packet, but are reported on the touch * interface. Because the touch interface may not have * been created yet, we cannot call wacom_map_usage(). In * order to process the usages when we receive them, we set * the usage type and code directly. */ wacom_wac->has_mute_touch_switch = true; usage->type = EV_SW; usage->code = SW_MUTE_DEVICE; break; case WACOM_HID_WD_TOUCHSTRIP: wacom_map_usage(input, usage, field, EV_ABS, ABS_RX, 0); features->device_type |= WACOM_DEVICETYPE_PAD; break; case WACOM_HID_WD_TOUCHSTRIP2: wacom_map_usage(input, usage, field, EV_ABS, ABS_RY, 0); features->device_type |= WACOM_DEVICETYPE_PAD; break; case WACOM_HID_WD_TOUCHRING: if (field->flags & HID_MAIN_ITEM_RELATIVE) { wacom_wac->relring_count++; if (wacom_wac->relring_count == 1) { wacom_map_usage(input, usage, field, EV_REL, REL_WHEEL_HI_RES, 0); set_bit(REL_WHEEL, input->relbit); } else if (wacom_wac->relring_count == 2) { wacom_map_usage(input, usage, field, EV_REL, REL_HWHEEL_HI_RES, 0); set_bit(REL_HWHEEL, input->relbit); } } else { wacom_wac->absring_count++; if (wacom_wac->absring_count == 1) wacom_map_usage(input, usage, field, EV_ABS, ABS_WHEEL, 0); else if (wacom_wac->absring_count == 2) wacom_map_usage(input, usage, field, EV_ABS, ABS_THROTTLE, 0); } features->device_type |= WACOM_DEVICETYPE_PAD; break; case WACOM_HID_WD_TOUCHRINGSTATUS: /* * Only set up type/code association. Completely mapping * this usage may overwrite the axis resolution and range. */ usage->type = EV_ABS; usage->code = ABS_WHEEL; set_bit(EV_ABS, input->evbit); features->device_type |= WACOM_DEVICETYPE_PAD; break; case WACOM_HID_WD_BUTTONCONFIG: wacom_map_usage(input, usage, field, EV_KEY, KEY_BUTTONCONFIG, 0); features->device_type |= WACOM_DEVICETYPE_PAD; break; case WACOM_HID_WD_ONSCREEN_KEYBOARD: wacom_map_usage(input, usage, field, EV_KEY, KEY_ONSCREEN_KEYBOARD, 0); features->device_type |= WACOM_DEVICETYPE_PAD; break; case WACOM_HID_WD_CONTROLPANEL: wacom_map_usage(input, usage, field, EV_KEY, KEY_CONTROLPANEL, 0); features->device_type |= WACOM_DEVICETYPE_PAD; break; case WACOM_HID_WD_MODE_CHANGE: /* do not overwrite previous data */ if (!wacom_wac->has_mode_change) { wacom_wac->has_mode_change = true; wacom_wac->is_direct_mode = true; } features->device_type |= WACOM_DEVICETYPE_PAD; break; } switch (equivalent_usage & 0xfffffff0) { case WACOM_HID_WD_EXPRESSKEY00: wacom_map_usage(input, usage, field, EV_KEY, wacom_numbered_button_to_key(features->numbered_buttons), 0); features->numbered_buttons++; features->device_type |= WACOM_DEVICETYPE_PAD; break; } } static void wacom_wac_pad_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct input_dev *input = wacom_wac->pad_input; struct wacom_features *features = &wacom_wac->features; unsigned equivalent_usage = wacom_equivalent_usage(usage->hid); int i; bool do_report = false; /* * Avoid reporting this event and setting inrange_state if this usage * hasn't been mapped. */ if (!usage->type && equivalent_usage != WACOM_HID_WD_MODE_CHANGE) return; if (wacom_equivalent_usage(field->physical) == HID_DG_TABLETFUNCTIONKEY) { bool is_abs_touchring = usage->hid == WACOM_HID_WD_TOUCHRING && !(field->flags & HID_MAIN_ITEM_RELATIVE); if (!is_abs_touchring) wacom_wac->hid_data.inrange_state |= value; } /* Process touch switch state first since it is reported through touch interface, * which is indepentent of pad interface. In the case when there are no other pad * events, the pad interface will not even be created. */ if ((equivalent_usage == WACOM_HID_WD_MUTE_DEVICE) || (equivalent_usage == WACOM_HID_WD_TOUCHONOFF)) { if (wacom_wac->shared->touch_input) { bool *is_touch_on = &wacom_wac->shared->is_touch_on; if (equivalent_usage == WACOM_HID_WD_MUTE_DEVICE && value) *is_touch_on = !(*is_touch_on); else if (equivalent_usage == WACOM_HID_WD_TOUCHONOFF) *is_touch_on = value; input_report_switch(wacom_wac->shared->touch_input, SW_MUTE_DEVICE, !(*is_touch_on)); input_sync(wacom_wac->shared->touch_input); } return; } if (!input) return; switch (equivalent_usage) { case WACOM_HID_WD_TOUCHRING: /* * Userspace expects touchrings to increase in value with * clockwise gestures and have their zero point at the * tablet's left. HID events "should" be clockwise- * increasing and zero at top, though the MobileStudio * Pro and 2nd-gen Intuos Pro don't do this... */ if (hdev->vendor == 0x56a && (hdev->product == 0x34d || hdev->product == 0x34e || /* MobileStudio Pro */ hdev->product == 0x357 || hdev->product == 0x358 || /* Intuos Pro 2 */ hdev->product == 0x392 || /* Intuos Pro 2 */ hdev->product == 0x398 || hdev->product == 0x399 || /* MobileStudio Pro */ hdev->product == 0x3AA)) { /* MobileStudio Pro */ value = (field->logical_maximum - value); if (hdev->product == 0x357 || hdev->product == 0x358 || hdev->product == 0x392) value = wacom_offset_rotation(input, usage, value, 3, 16); else if (hdev->product == 0x34d || hdev->product == 0x34e || hdev->product == 0x398 || hdev->product == 0x399 || hdev->product == 0x3AA) value = wacom_offset_rotation(input, usage, value, 1, 2); } else if (field->flags & HID_MAIN_ITEM_RELATIVE) { int hires_value = value * 120 / usage->resolution_multiplier; int *ring_value; int lowres_code; if (usage->code == REL_WHEEL_HI_RES) { /* We must invert the sign for vertical * relative scrolling. Clockwise * rotation produces positive values * from HW, but userspace treats * positive REL_WHEEL as a scroll *up*! */ hires_value = -hires_value; ring_value = &wacom_wac->hid_data.ring_value; lowres_code = REL_WHEEL; } else if (usage->code == REL_HWHEEL_HI_RES) { /* No need to invert the sign for * horizontal relative scrolling. * Clockwise rotation produces positive * values from HW and userspace treats * positive REL_HWHEEL as a scroll * right. */ ring_value = &wacom_wac->hid_data.ring2_value; lowres_code = REL_HWHEEL; } else { hid_err(wacom->hdev, "unrecognized relative wheel with code %d\n", usage->code); break; } value = hires_value; *ring_value += hires_value; /* Emulate a legacy wheel click for every 120 * units of hi-res travel. */ if (*ring_value >= 120 || *ring_value <= -120) { int clicks = *ring_value / 120; input_event(input, usage->type, lowres_code, clicks); *ring_value -= clicks * 120; } } else { value = wacom_offset_rotation(input, usage, value, 1, 4); } do_report = true; break; case WACOM_HID_WD_TOUCHRINGSTATUS: if (!value) input_event(input, usage->type, usage->code, 0); break; case WACOM_HID_WD_MODE_CHANGE: if (wacom_wac->is_direct_mode != value) { wacom_wac->is_direct_mode = value; wacom_schedule_work(&wacom->wacom_wac, WACOM_WORKER_MODE_CHANGE); } break; case WACOM_HID_WD_BUTTONCENTER: for (i = 0; i < wacom->led.count; i++) wacom_update_led(wacom, features->numbered_buttons, value, i); fallthrough; default: do_report = true; break; } if (do_report) { input_event(input, usage->type, usage->code, value); if (value) wacom_wac->hid_data.pad_input_event_flag = true; } } static void wacom_wac_pad_pre_report(struct hid_device *hdev, struct hid_report *report) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; wacom_wac->hid_data.inrange_state = 0; } static void wacom_wac_pad_report(struct hid_device *hdev, struct hid_report *report, struct hid_field *field) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct input_dev *input = wacom_wac->pad_input; bool active = wacom_wac->hid_data.inrange_state != 0; /* report prox for expresskey events */ if (wacom_wac->hid_data.pad_input_event_flag) { input_event(input, EV_ABS, ABS_MISC, active ? PAD_DEVICE_ID : 0); input_sync(input); if (!active) wacom_wac->hid_data.pad_input_event_flag = false; } } static void wacom_set_barrel_switch3_usage(struct wacom_wac *wacom_wac) { struct input_dev *input = wacom_wac->pen_input; struct wacom_features *features = &wacom_wac->features; if (!(features->quirks & WACOM_QUIRK_AESPEN) && wacom_wac->hid_data.barrelswitch && wacom_wac->hid_data.barrelswitch2 && wacom_wac->hid_data.serialhi && !wacom_wac->hid_data.barrelswitch3) { input_set_capability(input, EV_KEY, BTN_STYLUS3); features->quirks |= WACOM_QUIRK_PEN_BUTTON3; } } static void wacom_wac_pen_usage_mapping(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct wacom_features *features = &wacom_wac->features; struct input_dev *input = wacom_wac->pen_input; unsigned equivalent_usage = wacom_equivalent_usage(usage->hid); switch (equivalent_usage) { case HID_GD_X: wacom_map_usage(input, usage, field, EV_ABS, ABS_X, 4); break; case HID_GD_Y: wacom_map_usage(input, usage, field, EV_ABS, ABS_Y, 4); break; case WACOM_HID_WD_DISTANCE: case HID_GD_Z: wacom_map_usage(input, usage, field, EV_ABS, ABS_DISTANCE, 0); break; case HID_DG_TIPPRESSURE: wacom_map_usage(input, usage, field, EV_ABS, ABS_PRESSURE, 0); break; case HID_DG_INRANGE: wacom_map_usage(input, usage, field, EV_KEY, BTN_TOOL_PEN, 0); break; case HID_DG_INVERT: wacom_map_usage(input, usage, field, EV_KEY, BTN_TOOL_RUBBER, 0); break; case HID_DG_TILT_X: wacom_map_usage(input, usage, field, EV_ABS, ABS_TILT_X, 0); break; case HID_DG_TILT_Y: wacom_map_usage(input, usage, field, EV_ABS, ABS_TILT_Y, 0); break; case HID_DG_TWIST: wacom_map_usage(input, usage, field, EV_ABS, ABS_Z, 0); break; case HID_DG_ERASER: input_set_capability(input, EV_KEY, BTN_TOOL_RUBBER); wacom_map_usage(input, usage, field, EV_KEY, BTN_TOUCH, 0); break; case HID_DG_TIPSWITCH: input_set_capability(input, EV_KEY, BTN_TOOL_PEN); wacom_map_usage(input, usage, field, EV_KEY, BTN_TOUCH, 0); break; case HID_DG_BARRELSWITCH: wacom_wac->hid_data.barrelswitch = true; wacom_set_barrel_switch3_usage(wacom_wac); wacom_map_usage(input, usage, field, EV_KEY, BTN_STYLUS, 0); break; case HID_DG_BARRELSWITCH2: wacom_wac->hid_data.barrelswitch2 = true; wacom_set_barrel_switch3_usage(wacom_wac); wacom_map_usage(input, usage, field, EV_KEY, BTN_STYLUS2, 0); break; case HID_DG_TOOLSERIALNUMBER: features->quirks |= WACOM_QUIRK_TOOLSERIAL; wacom_map_usage(input, usage, field, EV_MSC, MSC_SERIAL, 0); break; case HID_DG_SCANTIME: wacom_map_usage(input, usage, field, EV_MSC, MSC_TIMESTAMP, 0); break; case WACOM_HID_WD_SENSE: features->quirks |= WACOM_QUIRK_SENSE; wacom_map_usage(input, usage, field, EV_KEY, BTN_TOOL_PEN, 0); break; case WACOM_HID_WD_SERIALHI: wacom_wac->hid_data.serialhi = true; wacom_set_barrel_switch3_usage(wacom_wac); wacom_map_usage(input, usage, field, EV_ABS, ABS_MISC, 0); break; case WACOM_HID_WD_FINGERWHEEL: input_set_capability(input, EV_KEY, BTN_TOOL_AIRBRUSH); wacom_map_usage(input, usage, field, EV_ABS, ABS_WHEEL, 0); break; case WACOM_HID_WD_BARRELSWITCH3: wacom_wac->hid_data.barrelswitch3 = true; wacom_map_usage(input, usage, field, EV_KEY, BTN_STYLUS3, 0); features->quirks &= ~WACOM_QUIRK_PEN_BUTTON3; break; case WACOM_HID_WD_SEQUENCENUMBER: wacom_wac->hid_data.sequence_number = -1; break; } } static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct wacom_features *features = &wacom_wac->features; struct input_dev *input = wacom_wac->pen_input; unsigned equivalent_usage = wacom_equivalent_usage(usage->hid); if (wacom_wac->is_invalid_bt_frame) return; switch (equivalent_usage) { case HID_GD_Z: /* * HID_GD_Z "should increase as the control's position is * moved from high to low", while ABS_DISTANCE instead * increases in value as the tool moves from low to high. */ value = field->logical_maximum - value; break; case HID_DG_INRANGE: mod_timer(&wacom->idleprox_timer, jiffies + msecs_to_jiffies(100)); wacom_wac->hid_data.inrange_state = value; if (!(features->quirks & WACOM_QUIRK_SENSE)) wacom_wac->hid_data.sense_state = value; return; case HID_DG_INVERT: wacom_wac->hid_data.eraser |= value; return; case HID_DG_ERASER: wacom_wac->hid_data.eraser |= value; fallthrough; case HID_DG_TIPSWITCH: wacom_wac->hid_data.tipswitch |= value; return; case HID_DG_BARRELSWITCH: wacom_wac->hid_data.barrelswitch = value; return; case HID_DG_BARRELSWITCH2: wacom_wac->hid_data.barrelswitch2 = value; return; case HID_DG_TOOLSERIALNUMBER: if (value) { wacom_wac->serial[0] = (wacom_wac->serial[0] & ~0xFFFFFFFFULL); wacom_wac->serial[0] |= wacom_s32tou(value, field->report_size); } return; case HID_DG_TWIST: /* don't modify the value if the pen doesn't support the feature */ if (!wacom_is_art_pen(wacom_wac->id[0])) return; /* * Userspace expects pen twist to have its zero point when * the buttons/finger is on the tablet's left. HID values * are zero when buttons are toward the top. */ value = wacom_offset_rotation(input, usage, value, 1, 4); break; case WACOM_HID_WD_SENSE: wacom_wac->hid_data.sense_state = value; return; case WACOM_HID_WD_SERIALHI: if (value) { __u32 raw_value = wacom_s32tou(value, field->report_size); wacom_wac->serial[0] = (wacom_wac->serial[0] & 0xFFFFFFFF); wacom_wac->serial[0] |= ((__u64)raw_value) << 32; /* * Non-USI EMR devices may contain additional tool type * information here. See WACOM_HID_WD_TOOLTYPE case for * more details. */ if (value >> 20 == 1) { wacom_wac->id[0] |= raw_value & 0xFFFFF; } } return; case WACOM_HID_WD_TOOLTYPE: /* * Some devices (MobileStudio Pro, and possibly later * devices as well) do not return the complete tool * type in their WACOM_HID_WD_TOOLTYPE usage. Use a * bitwise OR so the complete value can be built * up over time :( */ wacom_wac->id[0] |= wacom_s32tou(value, field->report_size); return; case WACOM_HID_WD_OFFSETLEFT: if (features->offset_left && value != features->offset_left) hid_warn(hdev, "%s: overriding existing left offset " "%d -> %d\n", __func__, value, features->offset_left); features->offset_left = value; return; case WACOM_HID_WD_OFFSETRIGHT: if (features->offset_right && value != features->offset_right) hid_warn(hdev, "%s: overriding existing right offset " "%d -> %d\n", __func__, value, features->offset_right); features->offset_right = value; return; case WACOM_HID_WD_OFFSETTOP: if (features->offset_top && value != features->offset_top) hid_warn(hdev, "%s: overriding existing top offset " "%d -> %d\n", __func__, value, features->offset_top); features->offset_top = value; return; case WACOM_HID_WD_OFFSETBOTTOM: if (features->offset_bottom && value != features->offset_bottom) hid_warn(hdev, "%s: overriding existing bottom offset " "%d -> %d\n", __func__, value, features->offset_bottom); features->offset_bottom = value; return; case WACOM_HID_WD_REPORT_VALID: wacom_wac->is_invalid_bt_frame = !value; return; case WACOM_HID_WD_BARRELSWITCH3: wacom_wac->hid_data.barrelswitch3 = value; return; case WACOM_HID_WD_SEQUENCENUMBER: if (wacom_wac->hid_data.sequence_number != value && wacom_wac->hid_data.sequence_number >= 0) { int sequence_size = field->logical_maximum - field->logical_minimum + 1; int drop_count = (value - wacom_wac->hid_data.sequence_number) % sequence_size; hid_warn(hdev, "Dropped %d packets", drop_count); } wacom_wac->hid_data.sequence_number = value + 1; if (wacom_wac->hid_data.sequence_number > field->logical_maximum) wacom_wac->hid_data.sequence_number = field->logical_minimum; return; } /* send pen events only when touch is up or forced out * or touch arbitration is off */ if (!usage->type || delay_pen_events(wacom_wac)) return; /* send pen events only when the pen is in range */ if (wacom_wac->hid_data.inrange_state) input_event(input, usage->type, usage->code, value); else if (wacom_wac->shared->stylus_in_proximity && !wacom_wac->hid_data.sense_state) input_event(input, usage->type, usage->code, 0); } static void wacom_wac_pen_pre_report(struct hid_device *hdev, struct hid_report *report) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; wacom_wac->is_invalid_bt_frame = false; return; } static void wacom_wac_pen_report(struct hid_device *hdev, struct hid_report *report) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct input_dev *input = wacom_wac->pen_input; bool range = wacom_wac->hid_data.inrange_state; bool sense = wacom_wac->hid_data.sense_state; bool entering_range = !wacom_wac->tool[0] && range; if (wacom_wac->is_invalid_bt_frame) return; if (entering_range) { /* first in range */ /* Going into range select tool */ if (wacom_wac->hid_data.eraser) wacom_wac->tool[0] = BTN_TOOL_RUBBER; else if (wacom_wac->features.quirks & WACOM_QUIRK_AESPEN) wacom_wac->tool[0] = BTN_TOOL_PEN; else if (wacom_wac->id[0]) wacom_wac->tool[0] = wacom_intuos_get_tool_type(wacom_wac->id[0]); else wacom_wac->tool[0] = BTN_TOOL_PEN; } /* keep pen state for touch events */ wacom_wac->shared->stylus_in_proximity = sense; if (!delay_pen_events(wacom_wac) && wacom_wac->tool[0]) { int id = wacom_wac->id[0]; if (wacom_wac->features.quirks & WACOM_QUIRK_PEN_BUTTON3) { int sw_state = wacom_wac->hid_data.barrelswitch | (wacom_wac->hid_data.barrelswitch2 << 1); wacom_wac->hid_data.barrelswitch = sw_state == 1; wacom_wac->hid_data.barrelswitch2 = sw_state == 2; wacom_wac->hid_data.barrelswitch3 = sw_state == 3; } input_report_key(input, BTN_STYLUS, wacom_wac->hid_data.barrelswitch); input_report_key(input, BTN_STYLUS2, wacom_wac->hid_data.barrelswitch2); input_report_key(input, BTN_STYLUS3, wacom_wac->hid_data.barrelswitch3); /* * Non-USI EMR tools should have their IDs mangled to * match the legacy behavior of wacom_intuos_general */ if (wacom_wac->serial[0] >> 52 == 1) id = wacom_intuos_id_mangle(id); /* * To ensure compatibility with xf86-input-wacom, we should * report the BTN_TOOL_* event prior to the ABS_MISC or * MSC_SERIAL events. */ input_report_key(input, BTN_TOUCH, wacom_wac->hid_data.tipswitch); input_report_key(input, wacom_wac->tool[0], sense); if (wacom_wac->serial[0]) { /* * xf86-input-wacom does not accept a serial number * of '0'. Report the low 32 bits if possible, but * if they are zero, report the upper ones instead. */ __u32 serial_lo = wacom_wac->serial[0] & 0xFFFFFFFFu; __u32 serial_hi = wacom_wac->serial[0] >> 32; input_event(input, EV_MSC, MSC_SERIAL, (int)(serial_lo ? serial_lo : serial_hi)); input_report_abs(input, ABS_MISC, sense ? id : 0); } wacom_wac->hid_data.tipswitch = false; wacom_wac->hid_data.eraser = false; input_sync(input); } /* Handle AES battery timeout behavior */ if (wacom_wac->features.quirks & WACOM_QUIRK_AESPEN) { if (entering_range) cancel_delayed_work(&wacom->aes_battery_work); if (!sense) schedule_delayed_work(&wacom->aes_battery_work, msecs_to_jiffies(WACOM_AES_BATTERY_TIMEOUT)); } if (!sense) { wacom_wac->tool[0] = 0; wacom_wac->id[0] = 0; wacom_wac->serial[0] = 0; } } static void wacom_wac_finger_usage_mapping(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct input_dev *input = wacom_wac->touch_input; unsigned touch_max = wacom_wac->features.touch_max; unsigned equivalent_usage = wacom_equivalent_usage(usage->hid); switch (equivalent_usage) { case HID_GD_X: if (touch_max == 1) wacom_map_usage(input, usage, field, EV_ABS, ABS_X, 4); else wacom_map_usage(input, usage, field, EV_ABS, ABS_MT_POSITION_X, 4); break; case HID_GD_Y: if (touch_max == 1) wacom_map_usage(input, usage, field, EV_ABS, ABS_Y, 4); else wacom_map_usage(input, usage, field, EV_ABS, ABS_MT_POSITION_Y, 4); break; case HID_DG_WIDTH: case HID_DG_HEIGHT: wacom_map_usage(input, usage, field, EV_ABS, ABS_MT_TOUCH_MAJOR, 0); wacom_map_usage(input, usage, field, EV_ABS, ABS_MT_TOUCH_MINOR, 0); input_set_abs_params(input, ABS_MT_ORIENTATION, 0, 1, 0, 0); break; case HID_DG_TIPSWITCH: wacom_map_usage(input, usage, field, EV_KEY, BTN_TOUCH, 0); break; case HID_DG_CONTACTCOUNT: wacom_wac->hid_data.cc_report = field->report->id; wacom_wac->hid_data.cc_index = field->index; wacom_wac->hid_data.cc_value_index = usage->usage_index; break; case HID_DG_CONTACTID: if ((field->logical_maximum - field->logical_minimum) < touch_max) { /* * The HID descriptor for G11 sensors leaves logical * maximum set to '1' despite it being a multitouch * device. Override to a sensible number. */ field->logical_maximum = 255; } break; case HID_DG_SCANTIME: wacom_map_usage(input, usage, field, EV_MSC, MSC_TIMESTAMP, 0); break; } } static void wacom_wac_finger_slot(struct wacom_wac *wacom_wac, struct input_dev *input) { struct hid_data *hid_data = &wacom_wac->hid_data; bool mt = wacom_wac->features.touch_max > 1; bool touch_down = hid_data->tipswitch && hid_data->confidence; bool prox = touch_down && report_touch_events(wacom_wac); if (touch_is_muted(wacom_wac)) { if (!wacom_wac->shared->touch_down) return; prox = false; } wacom_wac->hid_data.num_received++; if (wacom_wac->hid_data.num_received > wacom_wac->hid_data.num_expected) return; if (mt) { int slot; slot = input_mt_get_slot_by_key(input, hid_data->id); if (slot < 0) { return; } else { struct input_mt_slot *ps = &input->mt->slots[slot]; int mt_id = input_mt_get_value(ps, ABS_MT_TRACKING_ID); if (!prox && mt_id < 0) { // No data to send for this slot; short-circuit return; } } input_mt_slot(input, slot); input_mt_report_slot_state(input, MT_TOOL_FINGER, prox); } else { input_report_key(input, BTN_TOUCH, prox); } if (prox) { input_report_abs(input, mt ? ABS_MT_POSITION_X : ABS_X, hid_data->x); input_report_abs(input, mt ? ABS_MT_POSITION_Y : ABS_Y, hid_data->y); if (test_bit(ABS_MT_TOUCH_MAJOR, input->absbit)) { input_report_abs(input, ABS_MT_TOUCH_MAJOR, max(hid_data->width, hid_data->height)); input_report_abs(input, ABS_MT_TOUCH_MINOR, min(hid_data->width, hid_data->height)); if (hid_data->width != hid_data->height) input_report_abs(input, ABS_MT_ORIENTATION, hid_data->width <= hid_data->height ? 0 : 1); } } } static void wacom_wac_finger_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; unsigned equivalent_usage = wacom_equivalent_usage(usage->hid); struct wacom_features *features = &wacom->wacom_wac.features; if (touch_is_muted(wacom_wac) && !wacom_wac->shared->touch_down) return; if (wacom_wac->is_invalid_bt_frame) return; switch (equivalent_usage) { case HID_DG_CONFIDENCE: wacom_wac->hid_data.confidence = value; break; case HID_GD_X: wacom_wac->hid_data.x = value; break; case HID_GD_Y: wacom_wac->hid_data.y = value; break; case HID_DG_WIDTH: wacom_wac->hid_data.width = value; break; case HID_DG_HEIGHT: wacom_wac->hid_data.height = value; break; case HID_DG_CONTACTID: wacom_wac->hid_data.id = value; break; case HID_DG_TIPSWITCH: wacom_wac->hid_data.tipswitch = value; break; case WACOM_HID_WT_REPORT_VALID: wacom_wac->is_invalid_bt_frame = !value; return; case HID_DG_CONTACTMAX: if (!features->touch_max) { features->touch_max = value; } else { hid_warn(hdev, "%s: ignoring attempt to overwrite non-zero touch_max " "%d -> %d\n", __func__, features->touch_max, value); } return; } if (usage->usage_index + 1 == field->report_count) { if (equivalent_usage == wacom_wac->hid_data.last_slot_field) wacom_wac_finger_slot(wacom_wac, wacom_wac->touch_input); } } static void wacom_wac_finger_pre_report(struct hid_device *hdev, struct hid_report *report) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct hid_data* hid_data = &wacom_wac->hid_data; int i; if (touch_is_muted(wacom_wac) && !wacom_wac->shared->touch_down) return; wacom_wac->is_invalid_bt_frame = false; hid_data->confidence = true; hid_data->cc_report = 0; hid_data->cc_index = -1; hid_data->cc_value_index = -1; for (i = 0; i < report->maxfield; i++) { struct hid_field *field = report->field[i]; int j; for (j = 0; j < field->maxusage; j++) { struct hid_usage *usage = &field->usage[j]; unsigned int equivalent_usage = wacom_equivalent_usage(usage->hid); switch (equivalent_usage) { case HID_GD_X: case HID_GD_Y: case HID_DG_WIDTH: case HID_DG_HEIGHT: case HID_DG_CONTACTID: case HID_DG_INRANGE: case HID_DG_INVERT: case HID_DG_TIPSWITCH: hid_data->last_slot_field = equivalent_usage; break; case HID_DG_CONTACTCOUNT: hid_data->cc_report = report->id; hid_data->cc_index = i; hid_data->cc_value_index = j; break; } } } if (hid_data->cc_report != 0 && hid_data->cc_index >= 0) { struct hid_field *field = report->field[hid_data->cc_index]; int value = field->value[hid_data->cc_value_index]; if (value) { hid_data->num_expected = value; hid_data->num_received = 0; } } else { hid_data->num_expected = wacom_wac->features.touch_max; hid_data->num_received = 0; } } static void wacom_wac_finger_report(struct hid_device *hdev, struct hid_report *report) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct input_dev *input = wacom_wac->touch_input; unsigned touch_max = wacom_wac->features.touch_max; /* if there was nothing to process, don't send an empty sync */ if (wacom_wac->hid_data.num_expected == 0) return; /* If more packets of data are expected, give us a chance to * process them rather than immediately syncing a partial * update. */ if (wacom_wac->hid_data.num_received < wacom_wac->hid_data.num_expected) return; if (touch_max > 1) input_mt_sync_frame(input); input_sync(input); wacom_wac->hid_data.num_received = 0; wacom_wac->hid_data.num_expected = 0; /* keep touch state for pen event */ wacom_wac->shared->touch_down = wacom_wac_finger_count_touches(wacom_wac); } void wacom_wac_usage_mapping(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct wacom_features *features = &wacom_wac->features; if (WACOM_DIRECT_DEVICE(field)) features->device_type |= WACOM_DEVICETYPE_DIRECT; /* usage tests must precede field tests */ if (WACOM_BATTERY_USAGE(usage)) wacom_wac_battery_usage_mapping(hdev, field, usage); else if (WACOM_PAD_FIELD(field)) wacom_wac_pad_usage_mapping(hdev, field, usage); else if (WACOM_PEN_FIELD(field)) wacom_wac_pen_usage_mapping(hdev, field, usage); else if (WACOM_FINGER_FIELD(field)) wacom_wac_finger_usage_mapping(hdev, field, usage); } void wacom_wac_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct wacom *wacom = hid_get_drvdata(hdev); if (wacom->wacom_wac.features.type != HID_GENERIC) return; if (value > field->logical_maximum || value < field->logical_minimum) return; /* usage tests must precede field tests */ if (WACOM_BATTERY_USAGE(usage)) wacom_wac_battery_event(hdev, field, usage, value); else if (WACOM_PAD_FIELD(field)) wacom_wac_pad_event(hdev, field, usage, value); else if (WACOM_PEN_FIELD(field) && wacom->wacom_wac.pen_input) wacom_wac_pen_event(hdev, field, usage, value); else if (WACOM_FINGER_FIELD(field) && wacom->wacom_wac.touch_input) wacom_wac_finger_event(hdev, field, usage, value); } static void wacom_report_events(struct hid_device *hdev, struct hid_report *report, int collection_index, int field_index) { int r; for (r = field_index; r < report->maxfield; r++) { struct hid_field *field; unsigned count, n; field = report->field[r]; count = field->report_count; if (!(HID_MAIN_ITEM_VARIABLE & field->flags)) continue; for (n = 0 ; n < count; n++) { if (field->usage[n].collection_index == collection_index) wacom_wac_event(hdev, field, &field->usage[n], field->value[n]); else return; } } } static int wacom_wac_collection(struct hid_device *hdev, struct hid_report *report, int collection_index, struct hid_field *field, int field_index) { struct wacom *wacom = hid_get_drvdata(hdev); wacom_report_events(hdev, report, collection_index, field_index); /* * Non-input reports may be sent prior to the device being * completely initialized. Since only their events need * to be processed, exit after 'wacom_report_events' has * been called to prevent potential crashes in the report- * processing functions. */ if (report->type != HID_INPUT_REPORT) return -1; if (WACOM_PAD_FIELD(field)) return 0; else if (WACOM_PEN_FIELD(field) && wacom->wacom_wac.pen_input) wacom_wac_pen_report(hdev, report); else if (WACOM_FINGER_FIELD(field) && wacom->wacom_wac.touch_input) wacom_wac_finger_report(hdev, report); return 0; } void wacom_wac_report(struct hid_device *hdev, struct hid_report *report) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct hid_field *field; bool pad_in_hid_field = false, pen_in_hid_field = false, finger_in_hid_field = false, true_pad = false; int r; int prev_collection = -1; if (wacom_wac->features.type != HID_GENERIC) return; for (r = 0; r < report->maxfield; r++) { field = report->field[r]; if (WACOM_PAD_FIELD(field)) pad_in_hid_field = true; if (WACOM_PEN_FIELD(field)) pen_in_hid_field = true; if (WACOM_FINGER_FIELD(field)) finger_in_hid_field = true; if (wacom_equivalent_usage(field->physical) == HID_DG_TABLETFUNCTIONKEY) true_pad = true; } wacom_wac_battery_pre_report(hdev, report); if (pad_in_hid_field && wacom_wac->pad_input) wacom_wac_pad_pre_report(hdev, report); if (pen_in_hid_field && wacom_wac->pen_input) wacom_wac_pen_pre_report(hdev, report); if (finger_in_hid_field && wacom_wac->touch_input) wacom_wac_finger_pre_report(hdev, report); for (r = 0; r < report->maxfield; r++) { field = report->field[r]; if (field->usage[0].collection_index != prev_collection) { if (wacom_wac_collection(hdev, report, field->usage[0].collection_index, field, r) < 0) return; prev_collection = field->usage[0].collection_index; } } wacom_wac_battery_report(hdev, report); if (true_pad && wacom_wac->pad_input) wacom_wac_pad_report(hdev, report, field); } static int wacom_bpt_touch(struct wacom_wac *wacom) { struct wacom_features *features = &wacom->features; struct input_dev *input = wacom->touch_input; struct input_dev *pad_input = wacom->pad_input; unsigned char *data = wacom->data; int i; if (data[0] != 0x02) return 0; for (i = 0; i < 2; i++) { int offset = (data[1] & 0x80) ? (8 * i) : (9 * i); bool touch = report_touch_events(wacom) && (data[offset + 3] & 0x80); input_mt_slot(input, i); input_mt_report_slot_state(input, MT_TOOL_FINGER, touch); if (touch) { int x = get_unaligned_be16(&data[offset + 3]) & 0x7ff; int y = get_unaligned_be16(&data[offset + 5]) & 0x7ff; if (features->quirks & WACOM_QUIRK_BBTOUCH_LOWRES) { x <<= 5; y <<= 5; } input_report_abs(input, ABS_MT_POSITION_X, x); input_report_abs(input, ABS_MT_POSITION_Y, y); } } input_mt_sync_frame(input); input_report_key(pad_input, BTN_LEFT, (data[1] & 0x08) != 0); input_report_key(pad_input, BTN_FORWARD, (data[1] & 0x04) != 0); input_report_key(pad_input, BTN_BACK, (data[1] & 0x02) != 0); input_report_key(pad_input, BTN_RIGHT, (data[1] & 0x01) != 0); wacom->shared->touch_down = wacom_wac_finger_count_touches(wacom); return 1; } static void wacom_bpt3_touch_msg(struct wacom_wac *wacom, unsigned char *data) { struct wacom_features *features = &wacom->features; struct input_dev *input = wacom->touch_input; bool touch = data[1] & 0x80; int slot = input_mt_get_slot_by_key(input, data[0]); if (slot < 0) return; touch = touch && report_touch_events(wacom); input_mt_slot(input, slot); input_mt_report_slot_state(input, MT_TOOL_FINGER, touch); if (touch) { int x = (data[2] << 4) | (data[4] >> 4); int y = (data[3] << 4) | (data[4] & 0x0f); int width, height; if (features->type >= INTUOSPS && features->type <= INTUOSHT2) { width = data[5] * 100; height = data[6] * 100; } else { /* * "a" is a scaled-down area which we assume is * roughly circular and which can be described as: * a=(pi*r^2)/C. */ int a = data[5]; int x_res = input_abs_get_res(input, ABS_MT_POSITION_X); int y_res = input_abs_get_res(input, ABS_MT_POSITION_Y); width = 2 * int_sqrt(a * WACOM_CONTACT_AREA_SCALE); height = width * y_res / x_res; } input_report_abs(input, ABS_MT_POSITION_X, x); input_report_abs(input, ABS_MT_POSITION_Y, y); input_report_abs(input, ABS_MT_TOUCH_MAJOR, width); input_report_abs(input, ABS_MT_TOUCH_MINOR, height); } } static void wacom_bpt3_button_msg(struct wacom_wac *wacom, unsigned char *data) { struct input_dev *input = wacom->pad_input; struct wacom_features *features = &wacom->features; if (features->type == INTUOSHT || features->type == INTUOSHT2) { input_report_key(input, BTN_LEFT, (data[1] & 0x02) != 0); input_report_key(input, BTN_BACK, (data[1] & 0x08) != 0); } else { input_report_key(input, BTN_BACK, (data[1] & 0x02) != 0); input_report_key(input, BTN_LEFT, (data[1] & 0x08) != 0); } input_report_key(input, BTN_FORWARD, (data[1] & 0x04) != 0); input_report_key(input, BTN_RIGHT, (data[1] & 0x01) != 0); } static int wacom_bpt3_touch(struct wacom_wac *wacom) { unsigned char *data = wacom->data; int count = data[1] & 0x07; int touch_changed = 0, i; if (data[0] != 0x02) return 0; /* data has up to 7 fixed sized 8-byte messages starting at data[2] */ for (i = 0; i < count; i++) { int offset = (8 * i) + 2; int msg_id = data[offset]; if (msg_id >= 2 && msg_id <= 17) { wacom_bpt3_touch_msg(wacom, data + offset); touch_changed++; } else if (msg_id == 128) wacom_bpt3_button_msg(wacom, data + offset); } /* only update touch if we actually have a touchpad and touch data changed */ if (wacom->touch_input && touch_changed) { input_mt_sync_frame(wacom->touch_input); wacom->shared->touch_down = wacom_wac_finger_count_touches(wacom); } return 1; } static int wacom_bpt_pen(struct wacom_wac *wacom) { struct wacom_features *features = &wacom->features; struct input_dev *input = wacom->pen_input; unsigned char *data = wacom->data; int x = 0, y = 0, p = 0, d = 0; bool pen = false, btn1 = false, btn2 = false; bool range, prox, rdy; if (data[0] != WACOM_REPORT_PENABLED) return 0; range = (data[1] & 0x80) == 0x80; prox = (data[1] & 0x40) == 0x40; rdy = (data[1] & 0x20) == 0x20; wacom->shared->stylus_in_proximity = range; if (delay_pen_events(wacom)) return 0; if (rdy) { p = le16_to_cpup((__le16 *)&data[6]); pen = data[1] & 0x01; btn1 = data[1] & 0x02; btn2 = data[1] & 0x04; } if (prox) { x = le16_to_cpup((__le16 *)&data[2]); y = le16_to_cpup((__le16 *)&data[4]); if (data[1] & 0x08) { wacom->tool[0] = BTN_TOOL_RUBBER; wacom->id[0] = ERASER_DEVICE_ID; } else { wacom->tool[0] = BTN_TOOL_PEN; wacom->id[0] = STYLUS_DEVICE_ID; } wacom->reporting_data = true; } if (range) { /* * Convert distance from out prox to distance from tablet. * distance will be greater than distance_max once * touching and applying pressure; do not report negative * distance. */ if (data[8] <= features->distance_max) d = features->distance_max - data[8]; } else { wacom->id[0] = 0; } if (wacom->reporting_data) { input_report_key(input, BTN_TOUCH, pen); input_report_key(input, BTN_STYLUS, btn1); input_report_key(input, BTN_STYLUS2, btn2); if (prox || !range) { input_report_abs(input, ABS_X, x); input_report_abs(input, ABS_Y, y); } input_report_abs(input, ABS_PRESSURE, p); input_report_abs(input, ABS_DISTANCE, d); input_report_key(input, wacom->tool[0], range); /* PEN or RUBBER */ input_report_abs(input, ABS_MISC, wacom->id[0]); /* TOOL ID */ } if (!range) { wacom->reporting_data = false; } return 1; } static int wacom_bpt_irq(struct wacom_wac *wacom, size_t len) { struct wacom_features *features = &wacom->features; if ((features->type == INTUOSHT2) && (features->device_type & WACOM_DEVICETYPE_PEN)) return wacom_intuos_irq(wacom); else if (len == WACOM_PKGLEN_BBTOUCH) return wacom_bpt_touch(wacom); else if (len == WACOM_PKGLEN_BBTOUCH3) return wacom_bpt3_touch(wacom); else if (len == WACOM_PKGLEN_BBFUN || len == WACOM_PKGLEN_BBPEN) return wacom_bpt_pen(wacom); return 0; } static void wacom_bamboo_pad_pen_event(struct wacom_wac *wacom, unsigned char *data) { unsigned char prefix; /* * We need to reroute the event from the debug interface to the * pen interface. * We need to add the report ID to the actual pen report, so we * temporary overwrite the first byte to prevent having to kzalloc/kfree * and memcpy the report. */ prefix = data[0]; data[0] = WACOM_REPORT_BPAD_PEN; /* * actually reroute the event. * No need to check if wacom->shared->pen is valid, hid_input_report() * will check for us. */ hid_input_report(wacom->shared->pen, HID_INPUT_REPORT, data, WACOM_PKGLEN_PENABLED, 1); data[0] = prefix; } static int wacom_bamboo_pad_touch_event(struct wacom_wac *wacom, unsigned char *data) { struct input_dev *input = wacom->touch_input; unsigned char *finger_data, prefix; unsigned id; int x, y; bool valid; prefix = data[0]; for (id = 0; id < wacom->features.touch_max; id++) { valid = !!(prefix & BIT(id)) && report_touch_events(wacom); input_mt_slot(input, id); input_mt_report_slot_state(input, MT_TOOL_FINGER, valid); if (!valid) continue; finger_data = data + 1 + id * 3; x = finger_data[0] | ((finger_data[1] & 0x0f) << 8); y = (finger_data[2] << 4) | (finger_data[1] >> 4); input_report_abs(input, ABS_MT_POSITION_X, x); input_report_abs(input, ABS_MT_POSITION_Y, y); } input_mt_sync_frame(input); input_report_key(input, BTN_LEFT, prefix & 0x40); input_report_key(input, BTN_RIGHT, prefix & 0x80); /* keep touch state for pen event */ wacom->shared->touch_down = !!prefix && report_touch_events(wacom); return 1; } static int wacom_bamboo_pad_irq(struct wacom_wac *wacom, size_t len) { unsigned char *data = wacom->data; if (!((len == WACOM_PKGLEN_BPAD_TOUCH) || (len == WACOM_PKGLEN_BPAD_TOUCH_USB)) || (data[0] != WACOM_REPORT_BPAD_TOUCH)) return 0; if (data[1] & 0x01) wacom_bamboo_pad_pen_event(wacom, &data[1]); if (data[1] & 0x02) return wacom_bamboo_pad_touch_event(wacom, &data[9]); return 0; } static int wacom_wireless_irq(struct wacom_wac *wacom, size_t len) { unsigned char *data = wacom->data; int connected; if (len != WACOM_PKGLEN_WIRELESS || data[0] != WACOM_REPORT_WL) return 0; connected = data[1] & 0x01; if (connected) { int pid, battery, charging; if ((wacom->shared->type == INTUOSHT || wacom->shared->type == INTUOSHT2) && wacom->shared->touch_input && wacom->shared->touch_max) { input_report_switch(wacom->shared->touch_input, SW_MUTE_DEVICE, data[5] & 0x40); input_sync(wacom->shared->touch_input); } pid = get_unaligned_be16(&data[6]); battery = (data[5] & 0x3f) * 100 / 31; charging = !!(data[5] & 0x80); if (wacom->pid != pid) { wacom->pid = pid; wacom_schedule_work(wacom, WACOM_WORKER_WIRELESS); } wacom_notify_battery(wacom, WACOM_POWER_SUPPLY_STATUS_AUTO, battery, charging, 1, 0); } else if (wacom->pid != 0) { /* disconnected while previously connected */ wacom->pid = 0; wacom_schedule_work(wacom, WACOM_WORKER_WIRELESS); wacom_notify_battery(wacom, POWER_SUPPLY_STATUS_UNKNOWN, 0, 0, 0, 0); } return 0; } static int wacom_status_irq(struct wacom_wac *wacom_wac, size_t len) { struct wacom *wacom = container_of(wacom_wac, struct wacom, wacom_wac); struct wacom_features *features = &wacom_wac->features; unsigned char *data = wacom_wac->data; if (data[0] != WACOM_REPORT_USB) return 0; if ((features->type == INTUOSHT || features->type == INTUOSHT2) && wacom_wac->shared->touch_input && features->touch_max) { input_report_switch(wacom_wac->shared->touch_input, SW_MUTE_DEVICE, data[8] & 0x40); input_sync(wacom_wac->shared->touch_input); } if (data[9] & 0x02) { /* wireless module is attached */ int battery = (data[8] & 0x3f) * 100 / 31; bool charging = !!(data[8] & 0x80); features->quirks |= WACOM_QUIRK_BATTERY; wacom_notify_battery(wacom_wac, WACOM_POWER_SUPPLY_STATUS_AUTO, battery, charging, battery || charging, 1); } else if ((features->quirks & WACOM_QUIRK_BATTERY) && wacom->battery.battery) { features->quirks &= ~WACOM_QUIRK_BATTERY; wacom_notify_battery(wacom_wac, POWER_SUPPLY_STATUS_UNKNOWN, 0, 0, 0, 0); } return 0; } void wacom_wac_irq(struct wacom_wac *wacom_wac, size_t len) { bool sync; switch (wacom_wac->features.type) { case PENPARTNER: sync = wacom_penpartner_irq(wacom_wac); break; case PL: sync = wacom_pl_irq(wacom_wac); break; case WACOM_G4: case GRAPHIRE: case GRAPHIRE_BT: case WACOM_MO: sync = wacom_graphire_irq(wacom_wac); break; case PTU: sync = wacom_ptu_irq(wacom_wac); break; case DTU: sync = wacom_dtu_irq(wacom_wac); break; case DTUS: case DTUSX: sync = wacom_dtus_irq(wacom_wac); break; case INTUOS: case INTUOS3S: case INTUOS3: case INTUOS3L: case INTUOS4S: case INTUOS4: case INTUOS4L: case CINTIQ: case WACOM_BEE: case WACOM_13HD: case WACOM_21UX2: case WACOM_22HD: case WACOM_24HD: case WACOM_27QHD: case DTK: case CINTIQ_HYBRID: case CINTIQ_COMPANION_2: sync = wacom_intuos_irq(wacom_wac); break; case INTUOS4WL: sync = wacom_intuos_bt_irq(wacom_wac, len); break; case WACOM_24HDT: case WACOM_27QHDT: sync = wacom_24hdt_irq(wacom_wac); break; case INTUOS5S: case INTUOS5: case INTUOS5L: case INTUOSPS: case INTUOSPM: case INTUOSPL: if (len == WACOM_PKGLEN_BBTOUCH3) sync = wacom_bpt3_touch(wacom_wac); else if (wacom_wac->data[0] == WACOM_REPORT_USB) sync = wacom_status_irq(wacom_wac, len); else sync = wacom_intuos_irq(wacom_wac); break; case INTUOSP2_BT: case INTUOSP2S_BT: case INTUOSHT3_BT: sync = wacom_intuos_pro2_bt_irq(wacom_wac, len); break; case TABLETPC: case TABLETPCE: case TABLETPC2FG: case MTSCREEN: case MTTPC: case MTTPC_B: sync = wacom_tpc_irq(wacom_wac, len); break; case BAMBOO_PT: case BAMBOO_PEN: case BAMBOO_TOUCH: case INTUOSHT: case INTUOSHT2: if (wacom_wac->data[0] == WACOM_REPORT_USB) sync = wacom_status_irq(wacom_wac, len); else sync = wacom_bpt_irq(wacom_wac, len); break; case BAMBOO_PAD: sync = wacom_bamboo_pad_irq(wacom_wac, len); break; case WIRELESS: sync = wacom_wireless_irq(wacom_wac, len); break; case REMOTE: sync = false; if (wacom_wac->data[0] == WACOM_REPORT_DEVICE_LIST) wacom_remote_status_irq(wacom_wac, len); else sync = wacom_remote_irq(wacom_wac, len); break; default: sync = false; break; } if (sync) { if (wacom_wac->pen_input) input_sync(wacom_wac->pen_input); if (wacom_wac->touch_input) input_sync(wacom_wac->touch_input); if (wacom_wac->pad_input) input_sync(wacom_wac->pad_input); } } static void wacom_setup_basic_pro_pen(struct wacom_wac *wacom_wac) { struct input_dev *input_dev = wacom_wac->pen_input; input_set_capability(input_dev, EV_MSC, MSC_SERIAL); __set_bit(BTN_TOOL_PEN, input_dev->keybit); __set_bit(BTN_STYLUS, input_dev->keybit); __set_bit(BTN_STYLUS2, input_dev->keybit); input_set_abs_params(input_dev, ABS_DISTANCE, 0, wacom_wac->features.distance_max, wacom_wac->features.distance_fuzz, 0); } static void wacom_setup_cintiq(struct wacom_wac *wacom_wac) { struct input_dev *input_dev = wacom_wac->pen_input; struct wacom_features *features = &wacom_wac->features; wacom_setup_basic_pro_pen(wacom_wac); __set_bit(BTN_TOOL_RUBBER, input_dev->keybit); __set_bit(BTN_TOOL_BRUSH, input_dev->keybit); __set_bit(BTN_TOOL_PENCIL, input_dev->keybit); __set_bit(BTN_TOOL_AIRBRUSH, input_dev->keybit); input_set_abs_params(input_dev, ABS_WHEEL, 0, 1023, 0, 0); input_set_abs_params(input_dev, ABS_TILT_X, -64, 63, features->tilt_fuzz, 0); input_abs_set_res(input_dev, ABS_TILT_X, 57); input_set_abs_params(input_dev, ABS_TILT_Y, -64, 63, features->tilt_fuzz, 0); input_abs_set_res(input_dev, ABS_TILT_Y, 57); } static void wacom_setup_intuos(struct wacom_wac *wacom_wac) { struct input_dev *input_dev = wacom_wac->pen_input; input_set_capability(input_dev, EV_REL, REL_WHEEL); wacom_setup_cintiq(wacom_wac); __set_bit(BTN_LEFT, input_dev->keybit); __set_bit(BTN_RIGHT, input_dev->keybit); __set_bit(BTN_MIDDLE, input_dev->keybit); __set_bit(BTN_SIDE, input_dev->keybit); __set_bit(BTN_EXTRA, input_dev->keybit); __set_bit(BTN_TOOL_MOUSE, input_dev->keybit); __set_bit(BTN_TOOL_LENS, input_dev->keybit); input_set_abs_params(input_dev, ABS_RZ, -900, 899, 0, 0); input_abs_set_res(input_dev, ABS_RZ, 287); input_set_abs_params(input_dev, ABS_THROTTLE, -1023, 1023, 0, 0); } void wacom_setup_device_quirks(struct wacom *wacom) { struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct wacom_features *features = &wacom->wacom_wac.features; /* The pen and pad share the same interface on most devices */ if (features->type == GRAPHIRE_BT || features->type == WACOM_G4 || features->type == DTUS || (features->type >= INTUOS3S && features->type <= WACOM_MO)) { if (features->device_type & WACOM_DEVICETYPE_PEN) features->device_type |= WACOM_DEVICETYPE_PAD; } /* touch device found but size is not defined. use default */ if (features->device_type & WACOM_DEVICETYPE_TOUCH && !features->x_max) { features->x_max = 1023; features->y_max = 1023; } /* * Intuos5/Pro and Bamboo 3rd gen have no useful data about its * touch interface in its HID descriptor. If this is the touch * interface (PacketSize of WACOM_PKGLEN_BBTOUCH3), override the * tablet values. */ if ((features->type >= INTUOS5S && features->type <= INTUOSPL) || (features->type >= INTUOSHT && features->type <= BAMBOO_PT)) { if (features->pktlen == WACOM_PKGLEN_BBTOUCH3) { if (features->touch_max) features->device_type |= WACOM_DEVICETYPE_TOUCH; if (features->type >= INTUOSHT && features->type <= BAMBOO_PT) features->device_type |= WACOM_DEVICETYPE_PAD; if (features->type == INTUOSHT2) { features->x_max = features->x_max / 10; features->y_max = features->y_max / 10; } else { features->x_max = 4096; features->y_max = 4096; } } else if (features->pktlen == WACOM_PKGLEN_BBTOUCH) { features->device_type |= WACOM_DEVICETYPE_PAD; } } /* * Hack for the Bamboo One: * the device presents a PAD/Touch interface as most Bamboos and even * sends ghosts PAD data on it. However, later, we must disable this * ghost interface, and we can not detect it unless we set it here * to WACOM_DEVICETYPE_PAD or WACOM_DEVICETYPE_TOUCH. */ if (features->type == BAMBOO_PEN && features->pktlen == WACOM_PKGLEN_BBTOUCH3) features->device_type |= WACOM_DEVICETYPE_PAD; /* * Raw Wacom-mode pen and touch events both come from interface * 0, whose HID descriptor has an application usage of 0xFF0D * (i.e., WACOM_HID_WD_DIGITIZER). We route pen packets back * out through the HID_GENERIC device created for interface 1, * so rewrite this one to be of type WACOM_DEVICETYPE_TOUCH. */ if (features->type == BAMBOO_PAD) features->device_type = WACOM_DEVICETYPE_TOUCH; if (features->type == REMOTE) features->device_type = WACOM_DEVICETYPE_PAD; if (features->type == INTUOSP2_BT || features->type == INTUOSP2S_BT) { features->device_type |= WACOM_DEVICETYPE_PEN | WACOM_DEVICETYPE_PAD | WACOM_DEVICETYPE_TOUCH; features->quirks |= WACOM_QUIRK_BATTERY; } if (features->type == INTUOSHT3_BT) { features->device_type |= WACOM_DEVICETYPE_PEN | WACOM_DEVICETYPE_PAD; features->quirks |= WACOM_QUIRK_BATTERY; } switch (features->type) { case PL: case DTU: case DTUS: case DTUSX: case WACOM_21UX2: case WACOM_22HD: case DTK: case WACOM_24HD: case WACOM_27QHD: case CINTIQ_HYBRID: case CINTIQ_COMPANION_2: case CINTIQ: case WACOM_BEE: case WACOM_13HD: case WACOM_24HDT: case WACOM_27QHDT: case TABLETPC: case TABLETPCE: case TABLETPC2FG: case MTSCREEN: case MTTPC: case MTTPC_B: features->device_type |= WACOM_DEVICETYPE_DIRECT; break; } if (wacom->hdev->bus == BUS_BLUETOOTH) features->quirks |= WACOM_QUIRK_BATTERY; /* quirk for bamboo touch with 2 low res touches */ if ((features->type == BAMBOO_PT || features->type == BAMBOO_TOUCH) && features->pktlen == WACOM_PKGLEN_BBTOUCH) { features->x_max <<= 5; features->y_max <<= 5; features->x_fuzz <<= 5; features->y_fuzz <<= 5; features->quirks |= WACOM_QUIRK_BBTOUCH_LOWRES; } if (features->type == WIRELESS) { if (features->device_type == WACOM_DEVICETYPE_WL_MONITOR) { features->quirks |= WACOM_QUIRK_BATTERY; } } if (features->type == REMOTE) features->device_type |= WACOM_DEVICETYPE_WL_MONITOR; /* HID descriptor for DTK-2451 / DTH-2452 claims to report lots * of things it shouldn't. Lets fix up the damage... */ if (wacom->hdev->product == 0x382 || wacom->hdev->product == 0x37d) { features->quirks &= ~WACOM_QUIRK_TOOLSERIAL; __clear_bit(BTN_TOOL_BRUSH, wacom_wac->pen_input->keybit); __clear_bit(BTN_TOOL_PENCIL, wacom_wac->pen_input->keybit); __clear_bit(BTN_TOOL_AIRBRUSH, wacom_wac->pen_input->keybit); __clear_bit(ABS_Z, wacom_wac->pen_input->absbit); __clear_bit(ABS_DISTANCE, wacom_wac->pen_input->absbit); __clear_bit(ABS_TILT_X, wacom_wac->pen_input->absbit); __clear_bit(ABS_TILT_Y, wacom_wac->pen_input->absbit); __clear_bit(ABS_WHEEL, wacom_wac->pen_input->absbit); __clear_bit(ABS_MISC, wacom_wac->pen_input->absbit); __clear_bit(MSC_SERIAL, wacom_wac->pen_input->mscbit); __clear_bit(EV_MSC, wacom_wac->pen_input->evbit); } } int wacom_setup_pen_input_capabilities(struct input_dev *input_dev, struct wacom_wac *wacom_wac) { struct wacom_features *features = &wacom_wac->features; if (!(features->device_type & WACOM_DEVICETYPE_PEN)) return -ENODEV; if (features->device_type & WACOM_DEVICETYPE_DIRECT) __set_bit(INPUT_PROP_DIRECT, input_dev->propbit); else __set_bit(INPUT_PROP_POINTER, input_dev->propbit); if (features->type == HID_GENERIC) /* setup has already been done */ return 0; input_dev->evbit[0] |= BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); __set_bit(BTN_TOUCH, input_dev->keybit); __set_bit(ABS_MISC, input_dev->absbit); input_set_abs_params(input_dev, ABS_X, 0 + features->offset_left, features->x_max - features->offset_right, features->x_fuzz, 0); input_set_abs_params(input_dev, ABS_Y, 0 + features->offset_top, features->y_max - features->offset_bottom, features->y_fuzz, 0); input_set_abs_params(input_dev, ABS_PRESSURE, 0, features->pressure_max, features->pressure_fuzz, 0); /* penabled devices have fixed resolution for each model */ input_abs_set_res(input_dev, ABS_X, features->x_resolution); input_abs_set_res(input_dev, ABS_Y, features->y_resolution); switch (features->type) { case GRAPHIRE_BT: __clear_bit(ABS_MISC, input_dev->absbit); fallthrough; case WACOM_MO: case WACOM_G4: input_set_abs_params(input_dev, ABS_DISTANCE, 0, features->distance_max, features->distance_fuzz, 0); fallthrough; case GRAPHIRE: input_set_capability(input_dev, EV_REL, REL_WHEEL); __set_bit(BTN_LEFT, input_dev->keybit); __set_bit(BTN_RIGHT, input_dev->keybit); __set_bit(BTN_MIDDLE, input_dev->keybit); __set_bit(BTN_TOOL_RUBBER, input_dev->keybit); __set_bit(BTN_TOOL_PEN, input_dev->keybit); __set_bit(BTN_TOOL_MOUSE, input_dev->keybit); __set_bit(BTN_STYLUS, input_dev->keybit); __set_bit(BTN_STYLUS2, input_dev->keybit); break; case WACOM_27QHD: case WACOM_24HD: case DTK: case WACOM_22HD: case WACOM_21UX2: case WACOM_BEE: case CINTIQ: case WACOM_13HD: case CINTIQ_HYBRID: case CINTIQ_COMPANION_2: input_set_abs_params(input_dev, ABS_Z, -900, 899, 0, 0); input_abs_set_res(input_dev, ABS_Z, 287); wacom_setup_cintiq(wacom_wac); break; case INTUOS3: case INTUOS3L: case INTUOS3S: case INTUOS4: case INTUOS4WL: case INTUOS4L: case INTUOS4S: input_set_abs_params(input_dev, ABS_Z, -900, 899, 0, 0); input_abs_set_res(input_dev, ABS_Z, 287); fallthrough; case INTUOS: wacom_setup_intuos(wacom_wac); break; case INTUOS5: case INTUOS5L: case INTUOSPM: case INTUOSPL: case INTUOS5S: case INTUOSPS: case INTUOSP2_BT: case INTUOSP2S_BT: input_set_abs_params(input_dev, ABS_DISTANCE, 0, features->distance_max, features->distance_fuzz, 0); input_set_abs_params(input_dev, ABS_Z, -900, 899, 0, 0); input_abs_set_res(input_dev, ABS_Z, 287); wacom_setup_intuos(wacom_wac); break; case WACOM_24HDT: case WACOM_27QHDT: case MTSCREEN: case MTTPC: case MTTPC_B: case TABLETPC2FG: case TABLETPC: case TABLETPCE: __clear_bit(ABS_MISC, input_dev->absbit); fallthrough; case DTUS: case DTUSX: case PL: case DTU: __set_bit(BTN_TOOL_PEN, input_dev->keybit); __set_bit(BTN_TOOL_RUBBER, input_dev->keybit); __set_bit(BTN_STYLUS, input_dev->keybit); __set_bit(BTN_STYLUS2, input_dev->keybit); break; case PTU: __set_bit(BTN_STYLUS2, input_dev->keybit); fallthrough; case PENPARTNER: __set_bit(BTN_TOOL_PEN, input_dev->keybit); __set_bit(BTN_TOOL_RUBBER, input_dev->keybit); __set_bit(BTN_STYLUS, input_dev->keybit); break; case INTUOSHT: case BAMBOO_PT: case BAMBOO_PEN: case INTUOSHT2: case INTUOSHT3_BT: if (features->type == INTUOSHT2 || features->type == INTUOSHT3_BT) { wacom_setup_basic_pro_pen(wacom_wac); } else { __clear_bit(ABS_MISC, input_dev->absbit); __set_bit(BTN_TOOL_PEN, input_dev->keybit); __set_bit(BTN_TOOL_RUBBER, input_dev->keybit); __set_bit(BTN_STYLUS, input_dev->keybit); __set_bit(BTN_STYLUS2, input_dev->keybit); input_set_abs_params(input_dev, ABS_DISTANCE, 0, features->distance_max, features->distance_fuzz, 0); } break; case BAMBOO_PAD: __clear_bit(ABS_MISC, input_dev->absbit); break; } return 0; } int wacom_setup_touch_input_capabilities(struct input_dev *input_dev, struct wacom_wac *wacom_wac) { struct wacom_features *features = &wacom_wac->features; if (!(features->device_type & WACOM_DEVICETYPE_TOUCH)) return -ENODEV; if (features->device_type & WACOM_DEVICETYPE_DIRECT) __set_bit(INPUT_PROP_DIRECT, input_dev->propbit); else __set_bit(INPUT_PROP_POINTER, input_dev->propbit); if (features->type == HID_GENERIC) /* setup has already been done */ return 0; input_dev->evbit[0] |= BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); __set_bit(BTN_TOUCH, input_dev->keybit); if (features->touch_max == 1) { input_set_abs_params(input_dev, ABS_X, 0, features->x_max, features->x_fuzz, 0); input_set_abs_params(input_dev, ABS_Y, 0, features->y_max, features->y_fuzz, 0); input_abs_set_res(input_dev, ABS_X, features->x_resolution); input_abs_set_res(input_dev, ABS_Y, features->y_resolution); } else if (features->touch_max > 1) { input_set_abs_params(input_dev, ABS_MT_POSITION_X, 0, features->x_max, features->x_fuzz, 0); input_set_abs_params(input_dev, ABS_MT_POSITION_Y, 0, features->y_max, features->y_fuzz, 0); input_abs_set_res(input_dev, ABS_MT_POSITION_X, features->x_resolution); input_abs_set_res(input_dev, ABS_MT_POSITION_Y, features->y_resolution); } switch (features->type) { case INTUOSP2_BT: case INTUOSP2S_BT: input_dev->evbit[0] |= BIT_MASK(EV_SW); __set_bit(SW_MUTE_DEVICE, input_dev->swbit); if (wacom_wac->shared->touch->product == 0x361) { input_set_abs_params(input_dev, ABS_MT_POSITION_X, 0, 12440, 4, 0); input_set_abs_params(input_dev, ABS_MT_POSITION_Y, 0, 8640, 4, 0); } else if (wacom_wac->shared->touch->product == 0x360) { input_set_abs_params(input_dev, ABS_MT_POSITION_X, 0, 8960, 4, 0); input_set_abs_params(input_dev, ABS_MT_POSITION_Y, 0, 5920, 4, 0); } else if (wacom_wac->shared->touch->product == 0x393) { input_set_abs_params(input_dev, ABS_MT_POSITION_X, 0, 6400, 4, 0); input_set_abs_params(input_dev, ABS_MT_POSITION_Y, 0, 4000, 4, 0); } input_abs_set_res(input_dev, ABS_MT_POSITION_X, 40); input_abs_set_res(input_dev, ABS_MT_POSITION_Y, 40); fallthrough; case INTUOS5: case INTUOS5L: case INTUOSPM: case INTUOSPL: case INTUOS5S: case INTUOSPS: input_set_abs_params(input_dev, ABS_MT_TOUCH_MAJOR, 0, features->x_max, 0, 0); input_set_abs_params(input_dev, ABS_MT_TOUCH_MINOR, 0, features->y_max, 0, 0); input_mt_init_slots(input_dev, features->touch_max, INPUT_MT_POINTER); break; case WACOM_24HDT: input_set_abs_params(input_dev, ABS_MT_TOUCH_MAJOR, 0, features->x_max, 0, 0); input_set_abs_params(input_dev, ABS_MT_WIDTH_MAJOR, 0, features->x_max, 0, 0); input_set_abs_params(input_dev, ABS_MT_WIDTH_MINOR, 0, features->y_max, 0, 0); input_set_abs_params(input_dev, ABS_MT_ORIENTATION, 0, 1, 0, 0); fallthrough; case WACOM_27QHDT: if (wacom_wac->shared->touch->product == 0x32C || wacom_wac->shared->touch->product == 0xF6) { input_dev->evbit[0] |= BIT_MASK(EV_SW); __set_bit(SW_MUTE_DEVICE, input_dev->swbit); wacom_wac->has_mute_touch_switch = true; wacom_wac->is_soft_touch_switch = true; } fallthrough; case MTSCREEN: case MTTPC: case MTTPC_B: case TABLETPC2FG: input_mt_init_slots(input_dev, features->touch_max, INPUT_MT_DIRECT); fallthrough; case TABLETPC: case TABLETPCE: break; case INTUOSHT: case INTUOSHT2: input_dev->evbit[0] |= BIT_MASK(EV_SW); __set_bit(SW_MUTE_DEVICE, input_dev->swbit); fallthrough; case BAMBOO_PT: case BAMBOO_TOUCH: if (features->pktlen == WACOM_PKGLEN_BBTOUCH3) { input_set_abs_params(input_dev, ABS_MT_TOUCH_MAJOR, 0, features->x_max, 0, 0); input_set_abs_params(input_dev, ABS_MT_TOUCH_MINOR, 0, features->y_max, 0, 0); } input_mt_init_slots(input_dev, features->touch_max, INPUT_MT_POINTER); break; case BAMBOO_PAD: input_mt_init_slots(input_dev, features->touch_max, INPUT_MT_POINTER); __set_bit(BTN_LEFT, input_dev->keybit); __set_bit(BTN_RIGHT, input_dev->keybit); break; } return 0; } static int wacom_numbered_button_to_key(int n) { if (n < 10) return BTN_0 + n; else if (n < 16) return BTN_A + (n-10); else if (n < 18) return BTN_BASE + (n-16); else return 0; } static void wacom_setup_numbered_buttons(struct input_dev *input_dev, int button_count) { int i; for (i = 0; i < button_count; i++) { int key = wacom_numbered_button_to_key(i); if (key) __set_bit(key, input_dev->keybit); } } static void wacom_24hd_update_leds(struct wacom *wacom, int mask, int group) { struct wacom_led *led; int i; bool updated = false; /* * 24HD has LED group 1 to the left and LED group 0 to the right. * So group 0 matches the second half of the buttons and thus the mask * needs to be shifted. */ if (group == 0) mask >>= 8; for (i = 0; i < 3; i++) { led = wacom_led_find(wacom, group, i); if (!led) { hid_err(wacom->hdev, "can't find LED %d in group %d\n", i, group); continue; } if (!updated && mask & BIT(i)) { led->held = true; led_trigger_event(&led->trigger, LED_FULL); } else { led->held = false; } } } static bool wacom_is_led_toggled(struct wacom *wacom, int button_count, int mask, int group) { int group_button; /* * 21UX2 has LED group 1 to the left and LED group 0 * to the right. We need to reverse the group to match this * historical behavior. */ if (wacom->wacom_wac.features.type == WACOM_21UX2) group = 1 - group; group_button = group * (button_count/wacom->led.count); if (wacom->wacom_wac.features.type == INTUOSP2_BT) group_button = 8; return mask & (1 << group_button); } static void wacom_update_led(struct wacom *wacom, int button_count, int mask, int group) { struct wacom_led *led, *next_led; int cur; bool pressed; if (wacom->wacom_wac.features.type == WACOM_24HD) return wacom_24hd_update_leds(wacom, mask, group); pressed = wacom_is_led_toggled(wacom, button_count, mask, group); cur = wacom->led.groups[group].select; led = wacom_led_find(wacom, group, cur); if (!led) { hid_err(wacom->hdev, "can't find current LED %d in group %d\n", cur, group); return; } if (!pressed) { led->held = false; return; } if (led->held && pressed) return; next_led = wacom_led_next(wacom, led); if (!next_led) { hid_err(wacom->hdev, "can't find next LED in group %d\n", group); return; } if (next_led == led) return; next_led->held = true; led_trigger_event(&next_led->trigger, wacom_leds_brightness_get(next_led)); } static void wacom_report_numbered_buttons(struct input_dev *input_dev, int button_count, int mask) { struct wacom *wacom = input_get_drvdata(input_dev); int i; for (i = 0; i < wacom->led.count; i++) wacom_update_led(wacom, button_count, mask, i); for (i = 0; i < button_count; i++) { int key = wacom_numbered_button_to_key(i); if (key) input_report_key(input_dev, key, mask & (1 << i)); } } int wacom_setup_pad_input_capabilities(struct input_dev *input_dev, struct wacom_wac *wacom_wac) { struct wacom_features *features = &wacom_wac->features; if ((features->type == HID_GENERIC) && features->numbered_buttons > 0) features->device_type |= WACOM_DEVICETYPE_PAD; if (!(features->device_type & WACOM_DEVICETYPE_PAD)) return -ENODEV; if (features->type == REMOTE && input_dev == wacom_wac->pad_input) return -ENODEV; input_dev->evbit[0] |= BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); /* kept for making legacy xf86-input-wacom working with the wheels */ __set_bit(ABS_MISC, input_dev->absbit); /* kept for making legacy xf86-input-wacom accepting the pad */ if (!(input_dev->absinfo && (input_dev->absinfo[ABS_X].minimum || input_dev->absinfo[ABS_X].maximum))) input_set_abs_params(input_dev, ABS_X, 0, 1, 0, 0); if (!(input_dev->absinfo && (input_dev->absinfo[ABS_Y].minimum || input_dev->absinfo[ABS_Y].maximum))) input_set_abs_params(input_dev, ABS_Y, 0, 1, 0, 0); /* kept for making udev and libwacom accepting the pad */ __set_bit(BTN_STYLUS, input_dev->keybit); wacom_setup_numbered_buttons(input_dev, features->numbered_buttons); switch (features->type) { case CINTIQ_HYBRID: case CINTIQ_COMPANION_2: case DTK: case DTUS: case GRAPHIRE_BT: break; case WACOM_MO: __set_bit(BTN_BACK, input_dev->keybit); __set_bit(BTN_LEFT, input_dev->keybit); __set_bit(BTN_FORWARD, input_dev->keybit); __set_bit(BTN_RIGHT, input_dev->keybit); input_set_abs_params(input_dev, ABS_WHEEL, 0, 71, 0, 0); break; case WACOM_G4: __set_bit(BTN_BACK, input_dev->keybit); __set_bit(BTN_FORWARD, input_dev->keybit); input_set_capability(input_dev, EV_REL, REL_WHEEL); break; case WACOM_24HD: __set_bit(KEY_PROG1, input_dev->keybit); __set_bit(KEY_PROG2, input_dev->keybit); __set_bit(KEY_PROG3, input_dev->keybit); __set_bit(KEY_ONSCREEN_KEYBOARD, input_dev->keybit); __set_bit(KEY_INFO, input_dev->keybit); if (!features->oPid) __set_bit(KEY_BUTTONCONFIG, input_dev->keybit); input_set_abs_params(input_dev, ABS_WHEEL, 0, 71, 0, 0); input_set_abs_params(input_dev, ABS_THROTTLE, 0, 71, 0, 0); break; case WACOM_27QHD: __set_bit(KEY_PROG1, input_dev->keybit); __set_bit(KEY_PROG2, input_dev->keybit); __set_bit(KEY_PROG3, input_dev->keybit); __set_bit(KEY_ONSCREEN_KEYBOARD, input_dev->keybit); __set_bit(KEY_BUTTONCONFIG, input_dev->keybit); if (!features->oPid) __set_bit(KEY_CONTROLPANEL, input_dev->keybit); input_set_abs_params(input_dev, ABS_X, -2048, 2048, 0, 0); input_abs_set_res(input_dev, ABS_X, 1024); /* points/g */ input_set_abs_params(input_dev, ABS_Y, -2048, 2048, 0, 0); input_abs_set_res(input_dev, ABS_Y, 1024); input_set_abs_params(input_dev, ABS_Z, -2048, 2048, 0, 0); input_abs_set_res(input_dev, ABS_Z, 1024); __set_bit(INPUT_PROP_ACCELEROMETER, input_dev->propbit); break; case WACOM_22HD: __set_bit(KEY_PROG1, input_dev->keybit); __set_bit(KEY_PROG2, input_dev->keybit); __set_bit(KEY_PROG3, input_dev->keybit); __set_bit(KEY_BUTTONCONFIG, input_dev->keybit); __set_bit(KEY_INFO, input_dev->keybit); fallthrough; case WACOM_21UX2: case WACOM_BEE: case CINTIQ: input_set_abs_params(input_dev, ABS_RX, 0, 4096, 0, 0); input_set_abs_params(input_dev, ABS_RY, 0, 4096, 0, 0); break; case WACOM_13HD: input_set_abs_params(input_dev, ABS_WHEEL, 0, 71, 0, 0); break; case INTUOS3: case INTUOS3L: input_set_abs_params(input_dev, ABS_RY, 0, 4096, 0, 0); fallthrough; case INTUOS3S: input_set_abs_params(input_dev, ABS_RX, 0, 4096, 0, 0); break; case INTUOS5: case INTUOS5L: case INTUOSPM: case INTUOSPL: case INTUOS5S: case INTUOSPS: case INTUOSP2_BT: case INTUOSP2S_BT: input_set_abs_params(input_dev, ABS_WHEEL, 0, 71, 0, 0); break; case INTUOS4WL: /* * For Bluetooth devices, the udev rule does not work correctly * for pads unless we add a stylus capability, which forces * ID_INPUT_TABLET to be set. */ __set_bit(BTN_STYLUS, input_dev->keybit); fallthrough; case INTUOS4: case INTUOS4L: case INTUOS4S: input_set_abs_params(input_dev, ABS_WHEEL, 0, 71, 0, 0); break; case INTUOSHT: case BAMBOO_PT: case BAMBOO_TOUCH: case INTUOSHT2: __clear_bit(ABS_MISC, input_dev->absbit); __set_bit(BTN_LEFT, input_dev->keybit); __set_bit(BTN_FORWARD, input_dev->keybit); __set_bit(BTN_BACK, input_dev->keybit); __set_bit(BTN_RIGHT, input_dev->keybit); break; case REMOTE: input_set_capability(input_dev, EV_MSC, MSC_SERIAL); input_set_abs_params(input_dev, ABS_WHEEL, 0, 71, 0, 0); break; case INTUOSHT3_BT: case HID_GENERIC: break; default: /* no pad supported */ return -ENODEV; } return 0; } static const struct wacom_features wacom_features_0x00 = { "Wacom Penpartner", 5040, 3780, 255, 0, PENPARTNER, WACOM_PENPRTN_RES, WACOM_PENPRTN_RES }; static const struct wacom_features wacom_features_0x10 = { "Wacom Graphire", 10206, 7422, 511, 63, GRAPHIRE, WACOM_GRAPHIRE_RES, WACOM_GRAPHIRE_RES }; static const struct wacom_features wacom_features_0x81 = { "Wacom Graphire BT", 16704, 12064, 511, 32, GRAPHIRE_BT, WACOM_GRAPHIRE_RES, WACOM_GRAPHIRE_RES, 2 }; static const struct wacom_features wacom_features_0x11 = { "Wacom Graphire2 4x5", 10206, 7422, 511, 63, GRAPHIRE, WACOM_GRAPHIRE_RES, WACOM_GRAPHIRE_RES }; static const struct wacom_features wacom_features_0x12 = { "Wacom Graphire2 5x7", 13918, 10206, 511, 63, GRAPHIRE, WACOM_GRAPHIRE_RES, WACOM_GRAPHIRE_RES }; static const struct wacom_features wacom_features_0x13 = { "Wacom Graphire3", 10208, 7424, 511, 63, GRAPHIRE, WACOM_GRAPHIRE_RES, WACOM_GRAPHIRE_RES }; static const struct wacom_features wacom_features_0x14 = { "Wacom Graphire3 6x8", 16704, 12064, 511, 63, GRAPHIRE, WACOM_GRAPHIRE_RES, WACOM_GRAPHIRE_RES }; static const struct wacom_features wacom_features_0x15 = { "Wacom Graphire4 4x5", 10208, 7424, 511, 63, WACOM_G4, WACOM_GRAPHIRE_RES, WACOM_GRAPHIRE_RES }; static const struct wacom_features wacom_features_0x16 = { "Wacom Graphire4 6x8", 16704, 12064, 511, 63, WACOM_G4, WACOM_GRAPHIRE_RES, WACOM_GRAPHIRE_RES }; static const struct wacom_features wacom_features_0x17 = { "Wacom BambooFun 4x5", 14760, 9225, 511, 63, WACOM_MO, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x18 = { "Wacom BambooFun 6x8", 21648, 13530, 511, 63, WACOM_MO, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x19 = { "Wacom Bamboo1 Medium", 16704, 12064, 511, 63, GRAPHIRE, WACOM_GRAPHIRE_RES, WACOM_GRAPHIRE_RES }; static const struct wacom_features wacom_features_0x60 = { "Wacom Volito", 5104, 3712, 511, 63, GRAPHIRE, WACOM_VOLITO_RES, WACOM_VOLITO_RES }; static const struct wacom_features wacom_features_0x61 = { "Wacom PenStation2", 3250, 2320, 255, 63, GRAPHIRE, WACOM_VOLITO_RES, WACOM_VOLITO_RES }; static const struct wacom_features wacom_features_0x62 = { "Wacom Volito2 4x5", 5104, 3712, 511, 63, GRAPHIRE, WACOM_VOLITO_RES, WACOM_VOLITO_RES }; static const struct wacom_features wacom_features_0x63 = { "Wacom Volito2 2x3", 3248, 2320, 511, 63, GRAPHIRE, WACOM_VOLITO_RES, WACOM_VOLITO_RES }; static const struct wacom_features wacom_features_0x64 = { "Wacom PenPartner2", 3250, 2320, 511, 63, GRAPHIRE, WACOM_VOLITO_RES, WACOM_VOLITO_RES }; static const struct wacom_features wacom_features_0x65 = { "Wacom Bamboo", 14760, 9225, 511, 63, WACOM_MO, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x69 = { "Wacom Bamboo1", 5104, 3712, 511, 63, GRAPHIRE, WACOM_PENPRTN_RES, WACOM_PENPRTN_RES }; static const struct wacom_features wacom_features_0x6A = { "Wacom Bamboo1 4x6", 14760, 9225, 1023, 63, GRAPHIRE, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x6B = { "Wacom Bamboo1 5x8", 21648, 13530, 1023, 63, GRAPHIRE, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x20 = { "Wacom Intuos 4x5", 12700, 10600, 1023, 31, INTUOS, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x21 = { "Wacom Intuos 6x8", 20320, 16240, 1023, 31, INTUOS, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x22 = { "Wacom Intuos 9x12", 30480, 24060, 1023, 31, INTUOS, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x23 = { "Wacom Intuos 12x12", 30480, 31680, 1023, 31, INTUOS, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x24 = { "Wacom Intuos 12x18", 45720, 31680, 1023, 31, INTUOS, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x30 = { "Wacom PL400", 5408, 4056, 255, 0, PL, WACOM_PL_RES, WACOM_PL_RES }; static const struct wacom_features wacom_features_0x31 = { "Wacom PL500", 6144, 4608, 255, 0, PL, WACOM_PL_RES, WACOM_PL_RES }; static const struct wacom_features wacom_features_0x32 = { "Wacom PL600", 6126, 4604, 255, 0, PL, WACOM_PL_RES, WACOM_PL_RES }; static const struct wacom_features wacom_features_0x33 = { "Wacom PL600SX", 6260, 5016, 255, 0, PL, WACOM_PL_RES, WACOM_PL_RES }; static const struct wacom_features wacom_features_0x34 = { "Wacom PL550", 6144, 4608, 511, 0, PL, WACOM_PL_RES, WACOM_PL_RES }; static const struct wacom_features wacom_features_0x35 = { "Wacom PL800", 7220, 5780, 511, 0, PL, WACOM_PL_RES, WACOM_PL_RES }; static const struct wacom_features wacom_features_0x37 = { "Wacom PL700", 6758, 5406, 511, 0, PL, WACOM_PL_RES, WACOM_PL_RES }; static const struct wacom_features wacom_features_0x38 = { "Wacom PL510", 6282, 4762, 511, 0, PL, WACOM_PL_RES, WACOM_PL_RES }; static const struct wacom_features wacom_features_0x39 = { "Wacom DTU710", 34080, 27660, 511, 0, PL, WACOM_PL_RES, WACOM_PL_RES }; static const struct wacom_features wacom_features_0xC4 = { "Wacom DTF521", 6282, 4762, 511, 0, PL, WACOM_PL_RES, WACOM_PL_RES }; static const struct wacom_features wacom_features_0xC0 = { "Wacom DTF720", 6858, 5506, 511, 0, PL, WACOM_PL_RES, WACOM_PL_RES }; static const struct wacom_features wacom_features_0xC2 = { "Wacom DTF720a", 6858, 5506, 511, 0, PL, WACOM_PL_RES, WACOM_PL_RES }; static const struct wacom_features wacom_features_0x03 = { "Wacom Cintiq Partner", 20480, 15360, 511, 0, PTU, WACOM_PL_RES, WACOM_PL_RES }; static const struct wacom_features wacom_features_0x41 = { "Wacom Intuos2 4x5", 12700, 10600, 1023, 31, INTUOS, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x42 = { "Wacom Intuos2 6x8", 20320, 16240, 1023, 31, INTUOS, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x43 = { "Wacom Intuos2 9x12", 30480, 24060, 1023, 31, INTUOS, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x44 = { "Wacom Intuos2 12x12", 30480, 31680, 1023, 31, INTUOS, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x45 = { "Wacom Intuos2 12x18", 45720, 31680, 1023, 31, INTUOS, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0xB0 = { "Wacom Intuos3 4x5", 25400, 20320, 1023, 63, INTUOS3S, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 4 }; static const struct wacom_features wacom_features_0xB1 = { "Wacom Intuos3 6x8", 40640, 30480, 1023, 63, INTUOS3, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 8 }; static const struct wacom_features wacom_features_0xB2 = { "Wacom Intuos3 9x12", 60960, 45720, 1023, 63, INTUOS3, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 8 }; static const struct wacom_features wacom_features_0xB3 = { "Wacom Intuos3 12x12", 60960, 60960, 1023, 63, INTUOS3L, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 8 }; static const struct wacom_features wacom_features_0xB4 = { "Wacom Intuos3 12x19", 97536, 60960, 1023, 63, INTUOS3L, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 8 }; static const struct wacom_features wacom_features_0xB5 = { "Wacom Intuos3 6x11", 54204, 31750, 1023, 63, INTUOS3, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 8 }; static const struct wacom_features wacom_features_0xB7 = { "Wacom Intuos3 4x6", 31496, 19685, 1023, 63, INTUOS3S, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 4 }; static const struct wacom_features wacom_features_0xB8 = { "Wacom Intuos4 4x6", 31496, 19685, 2047, 63, INTUOS4S, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 7 }; static const struct wacom_features wacom_features_0xB9 = { "Wacom Intuos4 6x9", 44704, 27940, 2047, 63, INTUOS4, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9 }; static const struct wacom_features wacom_features_0xBA = { "Wacom Intuos4 8x13", 65024, 40640, 2047, 63, INTUOS4L, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9 }; static const struct wacom_features wacom_features_0xBB = { "Wacom Intuos4 12x19", 97536, 60960, 2047, 63, INTUOS4L, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9 }; static const struct wacom_features wacom_features_0xBC = { "Wacom Intuos4 WL", 40640, 25400, 2047, 63, INTUOS4, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9 }; static const struct wacom_features wacom_features_0xBD = { "Wacom Intuos4 WL", 40640, 25400, 2047, 63, INTUOS4WL, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9 }; static const struct wacom_features wacom_features_0x26 = { "Wacom Intuos5 touch S", 31496, 19685, 2047, 63, INTUOS5S, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 7, .touch_max = 16 }; static const struct wacom_features wacom_features_0x27 = { "Wacom Intuos5 touch M", 44704, 27940, 2047, 63, INTUOS5, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9, .touch_max = 16 }; static const struct wacom_features wacom_features_0x28 = { "Wacom Intuos5 touch L", 65024, 40640, 2047, 63, INTUOS5L, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9, .touch_max = 16 }; static const struct wacom_features wacom_features_0x29 = { "Wacom Intuos5 S", 31496, 19685, 2047, 63, INTUOS5S, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 7 }; static const struct wacom_features wacom_features_0x2A = { "Wacom Intuos5 M", 44704, 27940, 2047, 63, INTUOS5, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9 }; static const struct wacom_features wacom_features_0x314 = { "Wacom Intuos Pro S", 31496, 19685, 2047, 63, INTUOSPS, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 7, .touch_max = 16, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x315 = { "Wacom Intuos Pro M", 44704, 27940, 2047, 63, INTUOSPM, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9, .touch_max = 16, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x317 = { "Wacom Intuos Pro L", 65024, 40640, 2047, 63, INTUOSPL, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9, .touch_max = 16, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0xF4 = { "Wacom Cintiq 24HD", 104480, 65600, 2047, 63, WACOM_24HD, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 16, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET }; static const struct wacom_features wacom_features_0xF8 = { "Wacom Cintiq 24HD touch", 104480, 65600, 2047, 63, /* Pen */ WACOM_24HD, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 16, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, .oVid = USB_VENDOR_ID_WACOM, .oPid = 0xf6 }; static const struct wacom_features wacom_features_0xF6 = { "Wacom Cintiq 24HD touch", .type = WACOM_24HDT, /* Touch */ .oVid = USB_VENDOR_ID_WACOM, .oPid = 0xf8, .touch_max = 10, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x32A = { "Wacom Cintiq 27QHD", 120140, 67920, 2047, 63, WACOM_27QHD, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 0, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET }; static const struct wacom_features wacom_features_0x32B = { "Wacom Cintiq 27QHD touch", 120140, 67920, 2047, 63, WACOM_27QHD, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 0, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, .oVid = USB_VENDOR_ID_WACOM, .oPid = 0x32C }; static const struct wacom_features wacom_features_0x32C = { "Wacom Cintiq 27QHD touch", .type = WACOM_27QHDT, .oVid = USB_VENDOR_ID_WACOM, .oPid = 0x32B, .touch_max = 10 }; static const struct wacom_features wacom_features_0x3F = { "Wacom Cintiq 21UX", 87200, 65600, 1023, 63, CINTIQ, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 8 }; static const struct wacom_features wacom_features_0xC5 = { "Wacom Cintiq 20WSX", 86680, 54180, 1023, 63, WACOM_BEE, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 10 }; static const struct wacom_features wacom_features_0xC6 = { "Wacom Cintiq 12WX", 53020, 33440, 1023, 63, WACOM_BEE, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 10 }; static const struct wacom_features wacom_features_0x304 = { "Wacom Cintiq 13HD", 59552, 33848, 1023, 63, WACOM_13HD, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET }; static const struct wacom_features wacom_features_0x333 = { "Wacom Cintiq 13HD touch", 59552, 33848, 2047, 63, WACOM_13HD, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, .oVid = USB_VENDOR_ID_WACOM, .oPid = 0x335 }; static const struct wacom_features wacom_features_0x335 = { "Wacom Cintiq 13HD touch", .type = WACOM_24HDT, /* Touch */ .oVid = USB_VENDOR_ID_WACOM, .oPid = 0x333, .touch_max = 10, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0xC7 = { "Wacom DTU1931", 37832, 30305, 511, 0, PL, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0xCE = { "Wacom DTU2231", 47864, 27011, 511, 0, DTU, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .check_for_hid_type = true, .hid_type = HID_TYPE_USBMOUSE }; static const struct wacom_features wacom_features_0xF0 = { "Wacom DTU1631", 34623, 19553, 511, 0, DTU, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0xFB = { "Wacom DTU1031", 22096, 13960, 511, 0, DTUS, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 4, WACOM_DTU_OFFSET, WACOM_DTU_OFFSET, WACOM_DTU_OFFSET, WACOM_DTU_OFFSET }; static const struct wacom_features wacom_features_0x32F = { "Wacom DTU1031X", 22672, 12928, 511, 0, DTUSX, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 0, WACOM_DTU_OFFSET, WACOM_DTU_OFFSET, WACOM_DTU_OFFSET, WACOM_DTU_OFFSET }; static const struct wacom_features wacom_features_0x336 = { "Wacom DTU1141", 23672, 13403, 1023, 0, DTUS, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 4, WACOM_DTU_OFFSET, WACOM_DTU_OFFSET, WACOM_DTU_OFFSET, WACOM_DTU_OFFSET }; static const struct wacom_features wacom_features_0x57 = { "Wacom DTK2241", 95840, 54260, 2047, 63, DTK, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 6, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET }; static const struct wacom_features wacom_features_0x59 = /* Pen */ { "Wacom DTH2242", 95840, 54260, 2047, 63, DTK, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 6, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, .oVid = USB_VENDOR_ID_WACOM, .oPid = 0x5D }; static const struct wacom_features wacom_features_0x5D = /* Touch */ { "Wacom DTH2242", .type = WACOM_24HDT, .oVid = USB_VENDOR_ID_WACOM, .oPid = 0x59, .touch_max = 10, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0xCC = { "Wacom Cintiq 21UX2", 87200, 65600, 2047, 63, WACOM_21UX2, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 18, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET }; static const struct wacom_features wacom_features_0xFA = { "Wacom Cintiq 22HD", 95840, 54260, 2047, 63, WACOM_22HD, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 18, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET }; static const struct wacom_features wacom_features_0x5B = { "Wacom Cintiq 22HDT", 95840, 54260, 2047, 63, WACOM_22HD, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 18, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, .oVid = USB_VENDOR_ID_WACOM, .oPid = 0x5e }; static const struct wacom_features wacom_features_0x5E = { "Wacom Cintiq 22HDT", .type = WACOM_24HDT, .oVid = USB_VENDOR_ID_WACOM, .oPid = 0x5b, .touch_max = 10, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x90 = { "Wacom ISDv4 90", 26202, 16325, 255, 0, TABLETPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; /* Pen-only */ static const struct wacom_features wacom_features_0x93 = { "Wacom ISDv4 93", 26202, 16325, 255, 0, TABLETPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 1 }; static const struct wacom_features wacom_features_0x97 = { "Wacom ISDv4 97", 26202, 16325, 511, 0, TABLETPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; /* Pen-only */ static const struct wacom_features wacom_features_0x9A = { "Wacom ISDv4 9A", 26202, 16325, 255, 0, TABLETPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 1 }; static const struct wacom_features wacom_features_0x9F = { "Wacom ISDv4 9F", 26202, 16325, 255, 0, TABLETPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 1 }; static const struct wacom_features wacom_features_0xE2 = { "Wacom ISDv4 E2", 26202, 16325, 255, 0, TABLETPC2FG, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 2 }; static const struct wacom_features wacom_features_0xE3 = { "Wacom ISDv4 E3", 26202, 16325, 255, 0, TABLETPC2FG, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 2 }; static const struct wacom_features wacom_features_0xE5 = { "Wacom ISDv4 E5", 26202, 16325, 255, 0, MTSCREEN, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0xE6 = { "Wacom ISDv4 E6", 27760, 15694, 255, 0, TABLETPC2FG, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 2 }; static const struct wacom_features wacom_features_0xEC = { "Wacom ISDv4 EC", 25710, 14500, 255, 0, TABLETPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; /* Pen-only */ static const struct wacom_features wacom_features_0xED = { "Wacom ISDv4 ED", 26202, 16325, 255, 0, TABLETPCE, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 1 }; static const struct wacom_features wacom_features_0xEF = { "Wacom ISDv4 EF", 26202, 16325, 255, 0, TABLETPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; /* Pen-only */ static const struct wacom_features wacom_features_0x100 = { "Wacom ISDv4 100", 26202, 16325, 255, 0, MTTPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x101 = { "Wacom ISDv4 101", 26202, 16325, 255, 0, MTTPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x10D = { "Wacom ISDv4 10D", 26202, 16325, 255, 0, MTTPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x10E = { "Wacom ISDv4 10E", 27760, 15694, 255, 0, MTTPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x10F = { "Wacom ISDv4 10F", 27760, 15694, 255, 0, MTTPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x116 = { "Wacom ISDv4 116", 26202, 16325, 255, 0, TABLETPCE, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 1 }; static const struct wacom_features wacom_features_0x12C = { "Wacom ISDv4 12C", 27848, 15752, 2047, 0, TABLETPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; /* Pen-only */ static const struct wacom_features wacom_features_0x4001 = { "Wacom ISDv4 4001", 26202, 16325, 255, 0, MTTPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x4004 = { "Wacom ISDv4 4004", 11060, 6220, 255, 0, MTTPC_B, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x5000 = { "Wacom ISDv4 5000", 27848, 15752, 1023, 0, MTTPC_B, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x5002 = { "Wacom ISDv4 5002", 29576, 16724, 1023, 0, MTTPC_B, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x47 = { "Wacom Intuos2 6x8", 20320, 16240, 1023, 31, INTUOS, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x84 = { "Wacom Wireless Receiver", .type = WIRELESS, .touch_max = 16 }; static const struct wacom_features wacom_features_0xD0 = { "Wacom Bamboo 2FG", 14720, 9200, 1023, 31, BAMBOO_TOUCH, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 2 }; static const struct wacom_features wacom_features_0xD1 = { "Wacom Bamboo 2FG 4x5", 14720, 9200, 1023, 31, BAMBOO_PT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 2 }; static const struct wacom_features wacom_features_0xD2 = { "Wacom Bamboo Craft", 14720, 9200, 1023, 31, BAMBOO_PT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 2 }; static const struct wacom_features wacom_features_0xD3 = { "Wacom Bamboo 2FG 6x8", 21648, 13700, 1023, 31, BAMBOO_PT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 2 }; static const struct wacom_features wacom_features_0xD4 = { "Wacom Bamboo Pen", 14720, 9200, 1023, 31, BAMBOO_PEN, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0xD5 = { "Wacom Bamboo Pen 6x8", 21648, 13700, 1023, 31, BAMBOO_PEN, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0xD6 = { "Wacom BambooPT 2FG 4x5", 14720, 9200, 1023, 31, BAMBOO_PT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 2 }; static const struct wacom_features wacom_features_0xD7 = { "Wacom BambooPT 2FG Small", 14720, 9200, 1023, 31, BAMBOO_PT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 2 }; static const struct wacom_features wacom_features_0xD8 = { "Wacom Bamboo Comic 2FG", 21648, 13700, 1023, 31, BAMBOO_PT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 2 }; static const struct wacom_features wacom_features_0xDA = { "Wacom Bamboo 2FG 4x5 SE", 14720, 9200, 1023, 31, BAMBOO_PT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 2 }; static const struct wacom_features wacom_features_0xDB = { "Wacom Bamboo 2FG 6x8 SE", 21648, 13700, 1023, 31, BAMBOO_PT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 2 }; static const struct wacom_features wacom_features_0xDD = { "Wacom Bamboo Connect", 14720, 9200, 1023, 31, BAMBOO_PT, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0xDE = { "Wacom Bamboo 16FG 4x5", 14720, 9200, 1023, 31, BAMBOO_PT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 16 }; static const struct wacom_features wacom_features_0xDF = { "Wacom Bamboo 16FG 6x8", 21648, 13700, 1023, 31, BAMBOO_PT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 16 }; static const struct wacom_features wacom_features_0x300 = { "Wacom Bamboo One S", 14720, 9225, 1023, 31, BAMBOO_PEN, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x301 = { "Wacom Bamboo One M", 21648, 13530, 1023, 31, BAMBOO_PEN, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x302 = { "Wacom Intuos PT S", 15200, 9500, 1023, 31, INTUOSHT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 16, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x303 = { "Wacom Intuos PT M", 21600, 13500, 1023, 31, INTUOSHT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 16, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x30E = { "Wacom Intuos S", 15200, 9500, 1023, 31, INTUOSHT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x6004 = { "ISD-V4", 12800, 8000, 255, 0, TABLETPC, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x307 = { "Wacom ISDv5 307", 59552, 33848, 2047, 63, CINTIQ_HYBRID, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, .oVid = USB_VENDOR_ID_WACOM, .oPid = 0x309 }; static const struct wacom_features wacom_features_0x309 = { "Wacom ISDv5 309", .type = WACOM_24HDT, /* Touch */ .oVid = USB_VENDOR_ID_WACOM, .oPid = 0x0307, .touch_max = 10, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x30A = { "Wacom ISDv5 30A", 59552, 33848, 2047, 63, CINTIQ_HYBRID, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, .oVid = USB_VENDOR_ID_WACOM, .oPid = 0x30C }; static const struct wacom_features wacom_features_0x30C = { "Wacom ISDv5 30C", .type = WACOM_24HDT, /* Touch */ .oVid = USB_VENDOR_ID_WACOM, .oPid = 0x30A, .touch_max = 10, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x318 = { "Wacom USB Bamboo PAD", 4095, 4095, /* Touch */ .type = BAMBOO_PAD, 35, 48, .touch_max = 4 }; static const struct wacom_features wacom_features_0x319 = { "Wacom Wireless Bamboo PAD", 4095, 4095, /* Touch */ .type = BAMBOO_PAD, 35, 48, .touch_max = 4 }; static const struct wacom_features wacom_features_0x325 = { "Wacom ISDv5 325", 59552, 33848, 2047, 63, CINTIQ_COMPANION_2, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 11, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, WACOM_CINTIQ_OFFSET, .oVid = USB_VENDOR_ID_WACOM, .oPid = 0x326 }; static const struct wacom_features wacom_features_0x326 = /* Touch */ { "Wacom ISDv5 326", .type = HID_GENERIC, .oVid = USB_VENDOR_ID_WACOM, .oPid = 0x325 }; static const struct wacom_features wacom_features_0x323 = { "Wacom Intuos P M", 21600, 13500, 1023, 31, INTUOSHT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x331 = { "Wacom Express Key Remote", .type = REMOTE, .numbered_buttons = 18, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x33B = { "Wacom Intuos S 2", 15200, 9500, 2047, 63, INTUOSHT2, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x33C = { "Wacom Intuos PT S 2", 15200, 9500, 2047, 63, INTUOSHT2, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 16, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x33D = { "Wacom Intuos P M 2", 21600, 13500, 2047, 63, INTUOSHT2, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x33E = { "Wacom Intuos PT M 2", 21600, 13500, 2047, 63, INTUOSHT2, WACOM_INTUOS_RES, WACOM_INTUOS_RES, .touch_max = 16, .check_for_hid_type = true, .hid_type = HID_TYPE_USBNONE }; static const struct wacom_features wacom_features_0x343 = { "Wacom DTK1651", 34816, 19759, 1023, 0, DTUS, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 4, WACOM_DTU_OFFSET, WACOM_DTU_OFFSET, WACOM_DTU_OFFSET, WACOM_DTU_OFFSET }; static const struct wacom_features wacom_features_0x360 = { "Wacom Intuos Pro M", 44800, 29600, 8191, 63, INTUOSP2_BT, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9, .touch_max = 10 }; static const struct wacom_features wacom_features_0x361 = { "Wacom Intuos Pro L", 62200, 43200, 8191, 63, INTUOSP2_BT, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9, .touch_max = 10 }; static const struct wacom_features wacom_features_0x377 = { "Wacom Intuos BT S", 15200, 9500, 4095, 63, INTUOSHT3_BT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 4 }; static const struct wacom_features wacom_features_0x379 = { "Wacom Intuos BT M", 21600, 13500, 4095, 63, INTUOSHT3_BT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 4 }; static const struct wacom_features wacom_features_0x37A = { "Wacom One by Wacom S", 15200, 9500, 2047, 63, BAMBOO_PEN, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x37B = { "Wacom One by Wacom M", 21600, 13500, 2047, 63, BAMBOO_PEN, WACOM_INTUOS_RES, WACOM_INTUOS_RES }; static const struct wacom_features wacom_features_0x393 = { "Wacom Intuos Pro S", 31920, 19950, 8191, 63, INTUOSP2S_BT, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 7, .touch_max = 10 }; static const struct wacom_features wacom_features_0x3c6 = { "Wacom Intuos BT S", 15200, 9500, 4095, 63, INTUOSHT3_BT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 4 }; static const struct wacom_features wacom_features_0x3c8 = { "Wacom Intuos BT M", 21600, 13500, 4095, 63, INTUOSHT3_BT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 4 }; static const struct wacom_features wacom_features_0x3dd = { "Wacom Intuos Pro S", 31920, 19950, 8191, 63, INTUOSP2S_BT, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 7, .touch_max = 10 }; static const struct wacom_features wacom_features_HID_ANY_ID = { "Wacom HID", .type = HID_GENERIC, .oVid = HID_ANY_ID, .oPid = HID_ANY_ID }; static const struct wacom_features wacom_features_0x94 = { "Wacom Bootloader", .type = BOOTLOADER }; #define USB_DEVICE_WACOM(prod) \ HID_DEVICE(BUS_USB, HID_GROUP_WACOM, USB_VENDOR_ID_WACOM, prod),\ .driver_data = (kernel_ulong_t)&wacom_features_##prod #define BT_DEVICE_WACOM(prod) \ HID_DEVICE(BUS_BLUETOOTH, HID_GROUP_WACOM, USB_VENDOR_ID_WACOM, prod),\ .driver_data = (kernel_ulong_t)&wacom_features_##prod #define I2C_DEVICE_WACOM(prod) \ HID_DEVICE(BUS_I2C, HID_GROUP_WACOM, USB_VENDOR_ID_WACOM, prod),\ .driver_data = (kernel_ulong_t)&wacom_features_##prod #define PCI_DEVICE_WACOM(prod) \ HID_DEVICE(BUS_PCI, HID_GROUP_WACOM, USB_VENDOR_ID_WACOM, prod),\ .driver_data = (kernel_ulong_t)&wacom_features_##prod #define USB_DEVICE_LENOVO(prod) \ HID_USB_DEVICE(USB_VENDOR_ID_LENOVO, prod), \ .driver_data = (kernel_ulong_t)&wacom_features_##prod const struct hid_device_id wacom_ids[] = { { USB_DEVICE_WACOM(0x00) }, { USB_DEVICE_WACOM(0x03) }, { USB_DEVICE_WACOM(0x10) }, { USB_DEVICE_WACOM(0x11) }, { USB_DEVICE_WACOM(0x12) }, { USB_DEVICE_WACOM(0x13) }, { USB_DEVICE_WACOM(0x14) }, { USB_DEVICE_WACOM(0x15) }, { USB_DEVICE_WACOM(0x16) }, { USB_DEVICE_WACOM(0x17) }, { USB_DEVICE_WACOM(0x18) }, { USB_DEVICE_WACOM(0x19) }, { USB_DEVICE_WACOM(0x20) }, { USB_DEVICE_WACOM(0x21) }, { USB_DEVICE_WACOM(0x22) }, { USB_DEVICE_WACOM(0x23) }, { USB_DEVICE_WACOM(0x24) }, { USB_DEVICE_WACOM(0x26) }, { USB_DEVICE_WACOM(0x27) }, { USB_DEVICE_WACOM(0x28) }, { USB_DEVICE_WACOM(0x29) }, { USB_DEVICE_WACOM(0x2A) }, { USB_DEVICE_WACOM(0x30) }, { USB_DEVICE_WACOM(0x31) }, { USB_DEVICE_WACOM(0x32) }, { USB_DEVICE_WACOM(0x33) }, { USB_DEVICE_WACOM(0x34) }, { USB_DEVICE_WACOM(0x35) }, { USB_DEVICE_WACOM(0x37) }, { USB_DEVICE_WACOM(0x38) }, { USB_DEVICE_WACOM(0x39) }, { USB_DEVICE_WACOM(0x3F) }, { USB_DEVICE_WACOM(0x41) }, { USB_DEVICE_WACOM(0x42) }, { USB_DEVICE_WACOM(0x43) }, { USB_DEVICE_WACOM(0x44) }, { USB_DEVICE_WACOM(0x45) }, { USB_DEVICE_WACOM(0x47) }, { USB_DEVICE_WACOM(0x57) }, { USB_DEVICE_WACOM(0x59) }, { USB_DEVICE_WACOM(0x5B) }, { USB_DEVICE_WACOM(0x5D) }, { USB_DEVICE_WACOM(0x5E) }, { USB_DEVICE_WACOM(0x60) }, { USB_DEVICE_WACOM(0x61) }, { USB_DEVICE_WACOM(0x62) }, { USB_DEVICE_WACOM(0x63) }, { USB_DEVICE_WACOM(0x64) }, { USB_DEVICE_WACOM(0x65) }, { USB_DEVICE_WACOM(0x69) }, { USB_DEVICE_WACOM(0x6A) }, { USB_DEVICE_WACOM(0x6B) }, { BT_DEVICE_WACOM(0x81) }, { USB_DEVICE_WACOM(0x84) }, { USB_DEVICE_WACOM(0x90) }, { USB_DEVICE_WACOM(0x93) }, { USB_DEVICE_WACOM(0x94) }, { USB_DEVICE_WACOM(0x97) }, { USB_DEVICE_WACOM(0x9A) }, { USB_DEVICE_WACOM(0x9F) }, { USB_DEVICE_WACOM(0xB0) }, { USB_DEVICE_WACOM(0xB1) }, { USB_DEVICE_WACOM(0xB2) }, { USB_DEVICE_WACOM(0xB3) }, { USB_DEVICE_WACOM(0xB4) }, { USB_DEVICE_WACOM(0xB5) }, { USB_DEVICE_WACOM(0xB7) }, { USB_DEVICE_WACOM(0xB8) }, { USB_DEVICE_WACOM(0xB9) }, { USB_DEVICE_WACOM(0xBA) }, { USB_DEVICE_WACOM(0xBB) }, { USB_DEVICE_WACOM(0xBC) }, { BT_DEVICE_WACOM(0xBD) }, { USB_DEVICE_WACOM(0xC0) }, { USB_DEVICE_WACOM(0xC2) }, { USB_DEVICE_WACOM(0xC4) }, { USB_DEVICE_WACOM(0xC5) }, { USB_DEVICE_WACOM(0xC6) }, { USB_DEVICE_WACOM(0xC7) }, { USB_DEVICE_WACOM(0xCC) }, { USB_DEVICE_WACOM(0xCE) }, { USB_DEVICE_WACOM(0xD0) }, { USB_DEVICE_WACOM(0xD1) }, { USB_DEVICE_WACOM(0xD2) }, { USB_DEVICE_WACOM(0xD3) }, { USB_DEVICE_WACOM(0xD4) }, { USB_DEVICE_WACOM(0xD5) }, { USB_DEVICE_WACOM(0xD6) }, { USB_DEVICE_WACOM(0xD7) }, { USB_DEVICE_WACOM(0xD8) }, { USB_DEVICE_WACOM(0xDA) }, { USB_DEVICE_WACOM(0xDB) }, { USB_DEVICE_WACOM(0xDD) }, { USB_DEVICE_WACOM(0xDE) }, { USB_DEVICE_WACOM(0xDF) }, { USB_DEVICE_WACOM(0xE2) }, { USB_DEVICE_WACOM(0xE3) }, { USB_DEVICE_WACOM(0xE5) }, { USB_DEVICE_WACOM(0xE6) }, { USB_DEVICE_WACOM(0xEC) }, { USB_DEVICE_WACOM(0xED) }, { USB_DEVICE_WACOM(0xEF) }, { USB_DEVICE_WACOM(0xF0) }, { USB_DEVICE_WACOM(0xF4) }, { USB_DEVICE_WACOM(0xF6) }, { USB_DEVICE_WACOM(0xF8) }, { USB_DEVICE_WACOM(0xFA) }, { USB_DEVICE_WACOM(0xFB) }, { USB_DEVICE_WACOM(0x100) }, { USB_DEVICE_WACOM(0x101) }, { USB_DEVICE_WACOM(0x10D) }, { USB_DEVICE_WACOM(0x10E) }, { USB_DEVICE_WACOM(0x10F) }, { USB_DEVICE_WACOM(0x116) }, { USB_DEVICE_WACOM(0x12C) }, { USB_DEVICE_WACOM(0x300) }, { USB_DEVICE_WACOM(0x301) }, { USB_DEVICE_WACOM(0x302) }, { USB_DEVICE_WACOM(0x303) }, { USB_DEVICE_WACOM(0x304) }, { USB_DEVICE_WACOM(0x307) }, { USB_DEVICE_WACOM(0x309) }, { USB_DEVICE_WACOM(0x30A) }, { USB_DEVICE_WACOM(0x30C) }, { USB_DEVICE_WACOM(0x30E) }, { USB_DEVICE_WACOM(0x314) }, { USB_DEVICE_WACOM(0x315) }, { USB_DEVICE_WACOM(0x317) }, { USB_DEVICE_WACOM(0x318) }, { USB_DEVICE_WACOM(0x319) }, { USB_DEVICE_WACOM(0x323) }, { USB_DEVICE_WACOM(0x325) }, { USB_DEVICE_WACOM(0x326) }, { USB_DEVICE_WACOM(0x32A) }, { USB_DEVICE_WACOM(0x32B) }, { USB_DEVICE_WACOM(0x32C) }, { USB_DEVICE_WACOM(0x32F) }, { USB_DEVICE_WACOM(0x331) }, { USB_DEVICE_WACOM(0x333) }, { USB_DEVICE_WACOM(0x335) }, { USB_DEVICE_WACOM(0x336) }, { USB_DEVICE_WACOM(0x33B) }, { USB_DEVICE_WACOM(0x33C) }, { USB_DEVICE_WACOM(0x33D) }, { USB_DEVICE_WACOM(0x33E) }, { USB_DEVICE_WACOM(0x343) }, { BT_DEVICE_WACOM(0x360) }, { BT_DEVICE_WACOM(0x361) }, { BT_DEVICE_WACOM(0x377) }, { BT_DEVICE_WACOM(0x379) }, { USB_DEVICE_WACOM(0x37A) }, { USB_DEVICE_WACOM(0x37B) }, { BT_DEVICE_WACOM(0x393) }, { BT_DEVICE_WACOM(0x3c6) }, { BT_DEVICE_WACOM(0x3c8) }, { BT_DEVICE_WACOM(0x3dd) }, { USB_DEVICE_WACOM(0x4001) }, { USB_DEVICE_WACOM(0x4004) }, { USB_DEVICE_WACOM(0x5000) }, { USB_DEVICE_WACOM(0x5002) }, { USB_DEVICE_LENOVO(0x6004) }, { USB_DEVICE_WACOM(HID_ANY_ID) }, { I2C_DEVICE_WACOM(HID_ANY_ID) }, { PCI_DEVICE_WACOM(HID_ANY_ID) }, { BT_DEVICE_WACOM(HID_ANY_ID) }, { } }; MODULE_DEVICE_TABLE(hid, wacom_ids);
636 6 138 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ /* * This file provides wrappers with sanitizer instrumentation for bit * locking operations. * * To use this functionality, an arch's bitops.h file needs to define each of * the below bit operations with an arch_ prefix (e.g. arch_set_bit(), * arch___set_bit(), etc.). */ #ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H #define _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H #include <linux/instrumented.h> /** * clear_bit_unlock - Clear a bit in memory, for unlock * @nr: the bit to set * @addr: the address to start counting from * * This operation is atomic and provides release barrier semantics. */ static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) { kcsan_release(); instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); arch_clear_bit_unlock(nr, addr); } /** * __clear_bit_unlock - Clears a bit in memory * @nr: Bit to clear * @addr: Address to start counting from * * This is a non-atomic operation but implies a release barrier before the * memory operation. It can be used for an unlock if no other CPUs can * concurrently modify other bits in the word. */ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) { kcsan_release(); instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___clear_bit_unlock(nr, addr); } /** * test_and_set_bit_lock - Set a bit and return its old value, for lock * @nr: Bit to set * @addr: Address to count from * * This operation is atomic and provides acquire barrier semantics if * the returned value is 0. * It can be used to implement bit locks. */ static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr) { instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_set_bit_lock(nr, addr); } /** * xor_unlock_is_negative_byte - XOR a single byte in memory and test if * it is negative, for unlock. * @mask: Change the bits which are set in this mask. * @addr: The address of the word containing the byte to change. * * Changes some of bits 0-6 in the word pointed to by @addr. * This operation is atomic and provides release barrier semantics. * Used to optimise some folio operations which are commonly paired * with an unlock or end of writeback. Bit 7 is used as PG_waiters to * indicate whether anybody is waiting for the unlock. * * Return: Whether the top bit of the byte is set. */ static inline bool xor_unlock_is_negative_byte(unsigned long mask, volatile unsigned long *addr) { kcsan_release(); instrument_atomic_write(addr, sizeof(long)); return arch_xor_unlock_is_negative_byte(mask, addr); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H */
617 308 32 404 406 338 86 309 309 23 289 8 2 7 309 309 32 246 128 309 306 3 32 3 32 23 289 307 32 134 17 293 309 617 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2002 International Business Machines, Corp. * * This file is part of the SCTP kernel implementation * * These functions are the methods for accessing the SCTP inqueue. * * An SCTP inqueue is a queue into which you push SCTP packets * (which might be bundles or fragments of chunks) and out of which you * pop SCTP whole chunks. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <linux/interrupt.h> #include <linux/slab.h> /* Initialize an SCTP inqueue. */ void sctp_inq_init(struct sctp_inq *queue) { INIT_LIST_HEAD(&queue->in_chunk_list); queue->in_progress = NULL; /* Create a task for delivering data. */ INIT_WORK(&queue->immediate, NULL); } /* Properly release the chunk which is being worked on. */ static inline void sctp_inq_chunk_free(struct sctp_chunk *chunk) { if (chunk->head_skb) chunk->skb = chunk->head_skb; sctp_chunk_free(chunk); } /* Release the memory associated with an SCTP inqueue. */ void sctp_inq_free(struct sctp_inq *queue) { struct sctp_chunk *chunk, *tmp; /* Empty the queue. */ list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) { list_del_init(&chunk->list); sctp_chunk_free(chunk); } /* If there is a packet which is currently being worked on, * free it as well. */ if (queue->in_progress) { sctp_inq_chunk_free(queue->in_progress); queue->in_progress = NULL; } } /* Put a new packet in an SCTP inqueue. * We assume that packet->sctp_hdr is set and in host byte order. */ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *chunk) { /* Directly call the packet handling routine. */ if (chunk->rcvr->dead) { sctp_chunk_free(chunk); return; } /* We are now calling this either from the soft interrupt * or from the backlog processing. * Eventually, we should clean up inqueue to not rely * on the BH related data structures. */ list_add_tail(&chunk->list, &q->in_chunk_list); if (chunk->asoc) chunk->asoc->stats.ipackets++; q->immediate.func(&q->immediate); } /* Peek at the next chunk on the inqeue. */ struct sctp_chunkhdr *sctp_inq_peek(struct sctp_inq *queue) { struct sctp_chunk *chunk; struct sctp_chunkhdr *ch = NULL; chunk = queue->in_progress; /* If there is no more chunks in this packet, say so */ if (chunk->singleton || chunk->end_of_packet || chunk->pdiscard) return NULL; ch = (struct sctp_chunkhdr *)chunk->chunk_end; return ch; } /* Extract a chunk from an SCTP inqueue. * * WARNING: If you need to put the chunk on another queue, you need to * make a shallow copy (clone) of it. */ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) { struct sctp_chunk *chunk; struct sctp_chunkhdr *ch = NULL; /* The assumption is that we are safe to process the chunks * at this time. */ chunk = queue->in_progress; if (chunk) { /* There is a packet that we have been working on. * Any post processing work to do before we move on? */ if (chunk->singleton || chunk->end_of_packet || chunk->pdiscard) { if (chunk->head_skb == chunk->skb) { chunk->skb = skb_shinfo(chunk->skb)->frag_list; goto new_skb; } if (chunk->skb->next) { chunk->skb = chunk->skb->next; goto new_skb; } sctp_inq_chunk_free(chunk); chunk = queue->in_progress = NULL; } else { /* Nothing to do. Next chunk in the packet, please. */ ch = (struct sctp_chunkhdr *)chunk->chunk_end; /* Force chunk->skb->data to chunk->chunk_end. */ skb_pull(chunk->skb, chunk->chunk_end - chunk->skb->data); /* We are guaranteed to pull a SCTP header. */ } } /* Do we need to take the next packet out of the queue to process? */ if (!chunk) { struct list_head *entry; next_chunk: /* Is the queue empty? */ entry = sctp_list_dequeue(&queue->in_chunk_list); if (!entry) return NULL; chunk = list_entry(entry, struct sctp_chunk, list); if (skb_is_gso(chunk->skb) && skb_is_gso_sctp(chunk->skb)) { /* GSO-marked skbs but without frags, handle * them normally */ if (skb_shinfo(chunk->skb)->frag_list) chunk->head_skb = chunk->skb; /* skbs with "cover letter" */ if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len) { if (WARN_ON(!skb_shinfo(chunk->skb)->frag_list)) { __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS); sctp_chunk_free(chunk); goto next_chunk; } chunk->skb = skb_shinfo(chunk->skb)->frag_list; } } if (chunk->asoc) sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb); queue->in_progress = chunk; new_skb: /* This is the first chunk in the packet. */ ch = (struct sctp_chunkhdr *)chunk->skb->data; chunk->singleton = 1; chunk->data_accepted = 0; chunk->pdiscard = 0; chunk->auth = 0; chunk->has_asconf = 0; chunk->end_of_packet = 0; if (chunk->head_skb) { struct sctp_input_cb *cb = SCTP_INPUT_CB(chunk->skb), *head_cb = SCTP_INPUT_CB(chunk->head_skb); cb->chunk = head_cb->chunk; cb->af = head_cb->af; } } chunk->chunk_hdr = ch; chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); skb_pull(chunk->skb, sizeof(*ch)); chunk->subh.v = NULL; /* Subheader is no longer valid. */ if (chunk->chunk_end + sizeof(*ch) <= skb_tail_pointer(chunk->skb)) { /* This is not a singleton */ chunk->singleton = 0; } else if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) { /* Discard inside state machine. */ chunk->pdiscard = 1; chunk->chunk_end = skb_tail_pointer(chunk->skb); } else { /* We are at the end of the packet, so mark the chunk * in case we need to send a SACK. */ chunk->end_of_packet = 1; } pr_debug("+++sctp_inq_pop+++ chunk:%p[%s], length:%d, skb->len:%d\n", chunk, sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), ntohs(chunk->chunk_hdr->length), chunk->skb->len); return chunk; } /* Set a top-half handler. * * Originally, we the top-half handler was scheduled as a BH. We now * call the handler directly in sctp_inq_push() at a time that * we know we are lock safe. * The intent is that this routine will pull stuff out of the * inqueue and process it. */ void sctp_inq_set_th_handler(struct sctp_inq *q, work_func_t callback) { INIT_WORK(&q->immediate, callback); }
58 257 8 8 4 164 178 1 1 1 2 8 172 172 4 4 5 5 2 8 8 8 3 3 2 2 2 7 7 3 4 7 7 7 2 4 1 1 1 1 1 1 1 5 1 1 1 2 2 2 2 2 2 2 2 3 2 1 1 6 6 1 1 1 1 1 1 2 2 2 1 2 2 2 1 1 1 1 1 1 1 88 87 87 1 1 1 1 1 88 89 88 2 87 2 2 2 2 2 6 1 2 1 2 3 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 4 4 4 7 7 158 159 99 99 100 101 76 76 46 48 1 1 234 233 1 211 213 28 27 1 118 121 1 1 1 1 1 1 173 174 1 1 1 1 50 51 50 245 88 88 3 3 167 166 4 87 87 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 STRATO. All rights reserved. */ #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/rbtree.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/btrfs.h> #include <linux/sched/mm.h> #include "ctree.h" #include "transaction.h" #include "disk-io.h" #include "locking.h" #include "ulist.h" #include "backref.h" #include "extent_io.h" #include "qgroup.h" #include "block-group.h" #include "sysfs.h" #include "tree-mod-log.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" #include "tree-checker.h" enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info) { if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return BTRFS_QGROUP_MODE_DISABLED; if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) return BTRFS_QGROUP_MODE_SIMPLE; return BTRFS_QGROUP_MODE_FULL; } bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info) { return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED; } bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info) { return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL; } /* * Helpers to access qgroup reservation * * Callers should ensure the lock context and type are valid */ static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup) { u64 ret = 0; int i; for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) ret += qgroup->rsv.values[i]; return ret; } #ifdef CONFIG_BTRFS_DEBUG static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type) { if (type == BTRFS_QGROUP_RSV_DATA) return "data"; if (type == BTRFS_QGROUP_RSV_META_PERTRANS) return "meta_pertrans"; if (type == BTRFS_QGROUP_RSV_META_PREALLOC) return "meta_prealloc"; return NULL; } #endif static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { trace_btrfs_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); qgroup->rsv.values[type] += num_bytes; } static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { trace_btrfs_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); if (qgroup->rsv.values[type] >= num_bytes) { qgroup->rsv.values[type] -= num_bytes; return; } #ifdef CONFIG_BTRFS_DEBUG WARN_RATELIMIT(1, "qgroup %llu %s reserved space underflow, have %llu to free %llu", qgroup->qgroupid, qgroup_rsv_type_str(type), qgroup->rsv.values[type], num_bytes); #endif qgroup->rsv.values[type] = 0; } static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *dest, const struct btrfs_qgroup *src) { int i; for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i); } static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *dest, const struct btrfs_qgroup *src) { int i; for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i); } static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, int mod) { if (qg->old_refcnt < seq) qg->old_refcnt = seq; qg->old_refcnt += mod; } static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, int mod) { if (qg->new_refcnt < seq) qg->new_refcnt = seq; qg->new_refcnt += mod; } static inline u64 btrfs_qgroup_get_old_refcnt(const struct btrfs_qgroup *qg, u64 seq) { if (qg->old_refcnt < seq) return 0; return qg->old_refcnt - seq; } static inline u64 btrfs_qgroup_get_new_refcnt(const struct btrfs_qgroup *qg, u64 seq) { if (qg->new_refcnt < seq) return 0; return qg->new_refcnt - seq; } static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); static int btrfs_qgroup_qgroupid_key_cmp(const void *key, const struct rb_node *node) { const u64 *qgroupid = key; const struct btrfs_qgroup *qgroup = rb_entry(node, struct btrfs_qgroup, node); if (qgroup->qgroupid < *qgroupid) return -1; else if (qgroup->qgroupid > *qgroupid) return 1; return 0; } /* must be called with qgroup_ioctl_lock held */ static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info, u64 qgroupid) { struct rb_node *node; node = rb_find(&qgroupid, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_key_cmp); return rb_entry_safe(node, struct btrfs_qgroup, node); } static int btrfs_qgroup_qgroupid_cmp(struct rb_node *new, const struct rb_node *existing) { const struct btrfs_qgroup *new_qgroup = rb_entry(new, struct btrfs_qgroup, node); return btrfs_qgroup_qgroupid_key_cmp(&new_qgroup->qgroupid, existing); } /* * Add qgroup to the filesystem's qgroup tree. * * Must be called with qgroup_lock held and @prealloc preallocated. * * The control on the lifespan of @prealloc would be transferred to this * function, thus caller should no longer touch @prealloc. */ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *prealloc, u64 qgroupid) { struct rb_node *node; /* Caller must have pre-allocated @prealloc. */ ASSERT(prealloc); prealloc->qgroupid = qgroupid; node = rb_find_add(&prealloc->node, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_cmp); if (node) { kfree(prealloc); return rb_entry(node, struct btrfs_qgroup, node); } INIT_LIST_HEAD(&prealloc->groups); INIT_LIST_HEAD(&prealloc->members); INIT_LIST_HEAD(&prealloc->dirty); INIT_LIST_HEAD(&prealloc->iterator); INIT_LIST_HEAD(&prealloc->nested_iterator); return prealloc; } static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) { struct btrfs_qgroup_list *list; list_del(&qgroup->dirty); while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, struct btrfs_qgroup_list, next_group); list_del(&list->next_group); list_del(&list->next_member); kfree(list); } while (!list_empty(&qgroup->members)) { list = list_first_entry(&qgroup->members, struct btrfs_qgroup_list, next_member); list_del(&list->next_group); list_del(&list->next_member); kfree(list); } } /* must be called with qgroup_lock held */ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) { struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); if (!qgroup) return -ENOENT; rb_erase(&qgroup->node, &fs_info->qgroup_tree); __del_qgroup_rb(qgroup); return 0; } /* * Add relation specified by two qgroups. * * Must be called with qgroup_lock held, the ownership of @prealloc is * transferred to this function and caller should not touch it anymore. * * Return: 0 on success * -ENOENT if one of the qgroups is NULL * <0 other errors */ static int __add_relation_rb(struct btrfs_qgroup_list *prealloc, struct btrfs_qgroup *member, struct btrfs_qgroup *parent) { if (!member || !parent) { kfree(prealloc); return -ENOENT; } prealloc->group = parent; prealloc->member = member; list_add_tail(&prealloc->next_group, &member->groups); list_add_tail(&prealloc->next_member, &parent->members); return 0; } /* * Add relation specified by two qgroup ids. * * Must be called with qgroup_lock held. * * Return: 0 on success * -ENOENT if one of the ids does not exist * <0 other errors */ static int add_relation_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_list *prealloc, u64 memberid, u64 parentid) { struct btrfs_qgroup *member; struct btrfs_qgroup *parent; member = find_qgroup_rb(fs_info, memberid); parent = find_qgroup_rb(fs_info, parentid); return __add_relation_rb(prealloc, member, parent); } /* Must be called with qgroup_lock held */ static int del_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) { struct btrfs_qgroup *member; struct btrfs_qgroup *parent; struct btrfs_qgroup_list *list; member = find_qgroup_rb(fs_info, memberid); parent = find_qgroup_rb(fs_info, parentid); if (!member || !parent) return -ENOENT; list_for_each_entry(list, &member->groups, next_group) { if (list->group == parent) { list_del(&list->next_group); list_del(&list->next_member); kfree(list); retu